diff --git a/README.md b/README.md index a35ba14dc8cba490ae8970fba7881702fc3154fe..669ff5b711c62455f48038743ca1e089fa23d9e6 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ organization for the purposes of conducting machine learning and deep neural networks research. The system is general enough to be applicable in a wide variety of other domains, as well. +TensorFlow provides stable Python API and C APIs as well as without API backwards compatibility guarantee like C++, Go, Java, JavaScript and Swift. + Keep up to date with release announcements and security updates by subscribing to [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce). @@ -81,13 +83,13 @@ The TensorFlow project strives to abide by generally accepted best practices in | Build Type | Status | Artifacts | | --- | --- | --- | -| **Linux CPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) | -| **Linux GPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | -| **Linux XLA** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.png | TBA | -| **MacOS** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) | -| **Windows CPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.png) | [pypi](https://pypi.org/project/tf-nightly/) | -| **Windows GPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | -| **Android** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.png) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) | +| **Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | +| **Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA | +| **MacOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | +| **Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) | ### Community Supported Builds @@ -97,17 +99,20 @@ The TensorFlow project strives to abide by generally accepted best practices in | **IBM s390x** | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA | | **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA | | **IBM ppc64le GPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/) | TBA | -| **Linux CPU with Intel® MKL-DNN®** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | TBA | +| **Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | +| **Linux CPU with Intel® MKL-DNN** Python 2.7
**Linux CPU with Intel® MKL-DNN** Python 3.5
**Linux CPU with Intel® MKL-DNN** Python 3.6| ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)|[1.9.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.9.0-cp27-cp27mu-linux_x86_64.whl)
[1.9.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.9.0-cp35-cp35m-linux_x86_64.whl)
[1.9.0 py3.6](https://storage.cloud.google.com/intel-optimized-tensorflow/tensorflow-1.9.0-cp36-cp36m-linux_x86_64.whl) | ## For more information - +* [Tensorflow Blog](https://medium.com/tensorflow) +* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si) +* [TensorFlow Model Zoo](https://github.com/tensorflow/models) +* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730) +* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) +* [Tensorflow Twitter](https://twitter.com/tensorflow) * [TensorFlow Website](https://www.tensorflow.org) * [TensorFlow White Papers](https://www.tensorflow.org/about/bib) * [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) -* [TensorFlow Model Zoo](https://github.com/tensorflow/models) -* [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730) -* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si) Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate. diff --git a/RELEASE.md b/RELEASE.md index 6b67072f8ecafa08c747f8296c7c2a59eb2350fa..763ef3b279dde209ed387534032deae40a33a9e4 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,68 @@ +# Release 1.10.0 + +## Major Features And Improvements + +* The `tf.lite` runtime now supports `complex64`. +* Initial [Google Cloud Bigtable integration](https://github.com/tensorflow/tensorflow/tree/r1.10/tensorflow/contrib/bigtable) for `tf.data`. +* Improved local run behavior in `tf.estimator.train_and_evaluate` which does not reload checkpoints for evaluation. +* `RunConfig` now sets device_filters to restrict how workers and PS can communicate. This can speed up training and ensure clean shutdowns in some situations. But if you have jobs that require communication between workers, you will have to set custom session_options in your `RunConfig`. +* Moved Distributions and Bijectors from `tf.contrib.distributions` to [Tensorflow Probability (TFP)](https://github.com/tensorflow/probability). `tf.contrib.distributions` is now deprecated and will be removed by the end of 2018. +* Adding new endpoints for existing tensorflow symbols. These endpoints are going to be the preferred endpoints going forward and may replace some of the existing endpoints in the future. See below for the complete list. New symbols have been added to the following modules: [`tf.debugging`](https://www.tensorflow.org/versions/master/api_docs/python/tf/debugging), [`tf.dtypes`](https://www.tensorflow.org/versions/master/api_docs/python/tf/dtypes), [`tf.image`](https://www.tensorflow.org/versions/master/api_docs/python/tf/image), [`tf.io`](https://www.tensorflow.org/versions/master/api_docs/python/tf/io), [`tf.linalg`](https://www.tensorflow.org/versions/master/api_docs/python/tf/linalg), [`tf.manip`](https://www.tensorflow.org/versions/master/api_docs/python/tf/manip), [`tf.math`](https://www.tensorflow.org/versions/master/api_docs/python/tf/math), [`tf.quantization`](https://www.tensorflow.org/versions/master/api_docs/python/tf/quantization), [`tf.strings`](https://www.tensorflow.org/versions/master/api_docs/python/tf/strings) + +## Breaking Changes + +* Prebuilt binaries are now (as of TensorFlow 1.10) built against NCCL 2.2 and no longer include NCCL in the binary install. TensorFlow usage with multiple GPUs and NCCL requires upgrade to [NCCL 2.2](https://developer.nvidia.com/nccl). See updated install guides: [Installing TensorFlow on Ubuntu](https://www.tensorflow.org/install/install_linux#tensorflow_gpu_support) and [Install TensorFlow from Sources](https://www.tensorflow.org/install/install_sources#optional_install_tensorflow_for_gpu_prerequisites). +* Starting from TensorFlow 1.11, Windows builds will use Bazel. Therefore, we will drop official support for cmake. + +## Bug Fixes and Other Changes + +* `tf.data`: + * `tf.contrib.data.group_by_reducer()` is now available via the public API. + * `tf.contrib.data.choose_from_datasets()` is now available via the public API. + * Adding `drop_remainder` argument to `tf.data.Dataset.batch()` and `tf.data.Dataset.padded_batch()`, deprecating `tf.contrib.data.batch_and_drop_remainder()` and `tf.contrib.data.padded_batch_and_drop_remainder()`. +* `tf.estimator`: + * `Estimator`s now use custom savers included in `EstimatorSpec` scaffolds for saving SavedModels during export. + * `EstimatorSpec` will now add a default prediction output for export if no `export_output` is provided, eliminating the need to explicitly include a `PredictOutput` object in the `model_fn` for simple use-cases. + * Support sparse_combiner in canned Linear Estimators. + * Added batch normalization to `DNNClassifier`, `DNNRegressor`, and `DNNEstimator`. + * Adding ranking support for boosted trees. + * Adding center bias option for boosted trees. +* Add `synchronization` and `aggregation` args to get_variable(). These args will be used for distributed variables. +* Add `synchronization` and `aggregation` args to the layer `add_weight()` API. These args will be used for distributed variables. +* `tf.losses.*` do not add to the global collection when executing eagerly (to avoid leaking memory). +* Support different summary and checkpoint directories in `tf.train.MonitoredTrainingSession()`. +* Added IndRNN, IndyGRU, and IndyLSTM cells to `tf.contrib.rnn`. +* Add safe static factory functions for SparseTensor and convert all CHECKs to DCHECKs. Using the constructor directly is unsafe and deprecated. +* Make the Bigtable client connection pool configurable & increase the default # of connections for performance. +* Added derivative of `tf.random_gamma` with respect to the alpha parameter. +* Added derivative of `tf.igamma(a, x)` and `tf.igammac(a, x)` with respect to a. +* Modified Bessel functions of order zero and one. +* Add FillTriangular Bijector to create triangular matrices. +* Added support for Type III DCT, and `tf.spectral.idct(type=2|3)`. +* Correctly handle CuDNN RNN weight loaded when nest in `TimeDistributed`. +* Adding per-element weight support for `WALSComputePartialLhsAndRhsOp`. +* ZerosLike and OnesLike ops treated as constants by Graph Transform Tool. +* Gamma distribution and the derived distributions (Beta, Dirichlet, Student's t, inverse Gamma) now fully reparameterized. +* Java: Experimental wrapper classes to make graph generation easier. Thanks @karllessard and @kbsriram +* Build & link in secure gRPC components (switch from the insecure grpc dependency to secure grpc dependency). +* Adding new endpoints for existing tensorflow symbols. These endpoints are going to be the preferred endpoints going forward and may replace some of the existing endpoints in the future. List of new endpoints: + * New endpoints in `tf.image` namespace: `tf.image.extract_image_patches` + * New endpoints in `tf.debugging` namespace: `tf.debugging.check_numerics`, `tf.debugging.is_finite`, `tf.debugging.is_inf`, `tf.debugging.is_nan`. + * New endpoints in `tf.dtypes` namespace: `tf.dtypes.as_string`. + * New endpoints in `tf.io` namespace: `tf.io.decode_base64`, `tf.io.decode_compressed`, `tf.io.decode_json_example`, `tf.io.decode_raw`, `tf.io.encode_base64`, `tf.io.matching_files`, `tf.io.parse_tensor`, `tf.io.read_file, `tf.io.write_file`. + * New endpoints in tf.linalg namespace: `tf.linalg.cross`, `tf.linalg.tensor_diag` (corresponds to `tf.diag`), `tf.linalg.tensor_diag_part` (corresponds to `tf.diag_part`). + * New endpoints in tf.manip namespace: `tf.manip.batch_to_space_nd`, `tf.manip.gather_nd`, `tf.manip.reshape`, `tf.manip.reverse`, `tf.manip.scatter_nd`, `tf.manip.space_to_batch_nd`, `tf.manip.tile` + * New endpoints in tf.math namespace: `tf.math.acos`, `tf.math.acosh`, `tf.math.add`, `tf.math.asin`, `tf.math.asinh`, `tf.math.atan`, `tf.math.atan2`, `tf.math.atanh`, `tf.math.betainc`, `tf.math.ceil`, `tf.math.cos`, `tf.math.cosh`, `tf.math.digamma`, `tf.math.equal`, `tf.math.erfc`, `tf.math.exp`, `tf.math.expm1`, `tf.math.floor`, `tf.math.greater`, `tf.math.greater_equal`, `tf.math.igamma`, `tf.math.igammac`, `tf.math.invert_permutation`, `tf.math.less`, `tf.math.less_equal`, `tf.math.lgamma`, `tf.math.log`, `tf.math.log1p`, `tf.math.logical_and`, `tf.math.logical_not`, `tf.math.logical_or`, `tf.math.maximum`, `tf.math.minimum`, `tf.math.not_equal`, `tf.math.polygamma`, `tf.math.reciprocal`, `tf.math.rint`, `tf.math.rsqrt`, `tf.math.segment_max`, `tf.math.segment_mean`, `tf.math.segment_min`, `tf.math.segment_prod`, `tf.math.segment_sum`, `tf.math.sin`, `tf.math.sinh`, `tf.math.softplus`, `tf.math.softsign`, `tf.math.squared_difference`, `tf.math.tan`, `tf.math.unsorted_segment_max`, `tf.math.unsorted_segment_min`, `tf.math.unsorted_segment_prod`, `tf.math.unsorted_segment_sum`, `tf.math.zeta`. + * New endpoints in `tf.quantization` namespace: `tf.quantization.dequantize`, `tf.quantization.fake_quant_with_min_max_args`, `tf.quantization.fake_quant_with_min_max_args_gradient`, `tf.quantization.fake_quant_with_min_max_vars`, `tf.quantization.fake_quant_with_min_max_vars_gradient`, `tf.quantization.fake_quant_with_min_max_vars_per_channel`, `tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient`. + * New endpoints in tf.strings namespace: `tf.strings.join` (corresponds to `tf.string_join`), `tf.strings.regex_replace`, `tf.strings.to_number` (corresponds to `tf.string_to_number`), `tf.strings.strip` (corresponds to `tf.string_strip`), `tf.strings.substr`, `tf.strings.to_hash_bucket` (corresponds to `tf.string_to_hash_bucket`), `tf.strings.to_hash_bucket_fast` (corresponds to `tf.string_to_hash_bucket_fast`), `tf.strings.to_hash_bucket_strong` (corresponds to `tf.string_to_hash_bucket_strong`). + + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +Ag Ramesh, Alex Wiltschko, Alexander Pantyukhin, Amogh Mannekote, An Jiaoyang, Andrei Nigmatulin, Andrew Ginns, BjøRn Moholt, Brett Koonce, Chengzhi Chen, Chinmay Das, Christian Ertler, Christoph Boeddeker, Clayne Robison, Courtial Florian, ctiijima, Dan Douthit, Dan J, Dan Ringwalt, EFanZh, Emanuele Ballarin, eqy, Evgeniy Zheltonozhskiy, Freedom" Koan-Sin Tan, FréDéRic Branchaud-Charron, G K, gracehoney, Guillaume Klein, Guozhong Zhuang, Hsien-Yang Li, hsm207, ImSheridan, Jayaram Bobba, Jiandong Ruan, Jie, Joel Shor, Jonas Rauber, Jongmin Baek, jsawruk, Karan Kaw, Karl Lessard, karl@kubx.ca, Kb Sriram, KinmanLam, leiiwang, Li, Yiqiang, Loo Rong Jie, Mahmoud Abuzaina, Mahmoud Aslan, ManHyuk, Martin Patz, Martin Zeitler, mktozk, Mohammad Ashraf Bhuiyan, mrTsjolder, Naman Bhalla, Nick Felt, Nicolas Lopez, Niranjan Hasabnis, Nishidha Panpaliya, Nitish, nrstott, Nutti, Parag Jain, PeterLee, Philipp Jund, Rach L, Rafal Wojdyla, Roland Zimmermann, Sergei Lebedev, SneakyFish5, Soila Kavulya, Sriram Veturi, Steven Schmatz, Taehoon Lee, Tang, Wenyi, Taras Sereda, Ted Chang, Tim Zaman, Tristan Rice, tucan, vchigrin, Vikram Tiwari, Vincent, WeberXie, William D. Irons, Yan Facai (颜发才), Yong Tang, Yu Yi, Yuxin Wu, Zé ViníCius + # Release 1.9.0 ## Major Features And Improvements diff --git a/configure.py b/configure.py index f97bf8a66836a6647ba6aca625cb1526e11b39af..bf570a9fa394f8fb7ef98f57007b656afd0c466c 100644 --- a/configure.py +++ b/configure.py @@ -839,15 +839,16 @@ def set_tf_cuda_version(environ_cp): cuda_toolkit_path = cygpath(cuda_toolkit_path) if is_windows(): - cuda_rt_lib_path = 'lib/x64/cudart.lib' + cuda_rt_lib_paths = ['lib/x64/cudart.lib'] elif is_linux(): - cuda_rt_lib_path = 'lib64/libcudart.so.%s' % tf_cuda_version + cuda_rt_lib_paths = ['%s/libcudart.so.%s' % (x, tf_cuda_version) + for x in ['lib64', 'lib/x86_64-linux-gnu']] elif is_macos(): - cuda_rt_lib_path = 'lib/libcudart.%s.dylib' % tf_cuda_version + cuda_rt_lib_paths = ['lib/libcudart.%s.dylib' % tf_cuda_version] - cuda_toolkit_path_full = os.path.join(cuda_toolkit_path, cuda_rt_lib_path) - if os.path.exists(cuda_toolkit_path_full): - break + cuda_toolkit_paths_full = [os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths] + if any([os.path.exists(x) for x in cuda_toolkit_paths_full]): + break # Reset and retry print('Invalid path to CUDA %s toolkit. %s cannot be found' % diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 60db234c9c56fcca32418fcc3b10385f8d82bd45..b807c8c2c66889a037d387d2b5f2d56dd9cf18f3 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -123,12 +123,6 @@ config_setting( visibility = ["//visibility:public"], ) -config_setting( - name = "windows_msvc", - values = {"cpu": "x64_windows_msvc"}, - visibility = ["//visibility:public"], -) - config_setting( name = "no_tensorflow_py_deps", define_values = {"no_tensorflow_py_deps": "true"}, @@ -387,6 +381,7 @@ config_setting( define_values = { "dynamic_loaded_kernels": "true", }, + visibility = ["//visibility:public"], ) config_setting( @@ -441,11 +436,6 @@ filegroup( ), ) -filegroup( - name = "docs_src", - data = glob(["docs_src/**/*.md"]), -) - cc_library( name = "grpc", deps = select({ @@ -492,7 +482,6 @@ tf_cc_shared_object( linkopts = select({ "//tensorflow:darwin": [], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_framework_version_script.lds)", @@ -534,7 +523,6 @@ tf_cc_shared_object( "-Wl,-install_name,@rpath/libtensorflow.so", ], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file @@ -559,7 +547,6 @@ tf_cc_shared_object( "$(location //tensorflow:tf_exported_symbols.lds)", ], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file @@ -589,6 +576,7 @@ exports_files( gen_api_init_files( name = "tensorflow_python_api_gen", srcs = ["api_template.__init__.py"], + api_version = 1, root_init_template = "api_template.__init__.py", ) diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index bb9433ce25e0e3b9cfb54698c940cc1b38c88d31..73fe73769bc1219ce865149d67d333c53371ccc5 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -1619,5 +1619,66 @@ TEST_F(CApiFunctionTest, GetFunctionsFromGraph) { TF_DeleteFunction(func1); } +// This test only works when the TF build includes XLA compiler. One way to set +// this up is via bazel build option "--define with_xla_support=true". +// +// FIXME: generalize the macro name TENSORFLOW_EAGER_USE_XLA to +// something like TENSORFLOW_CAPI_USE_XLA. +#ifdef TENSORFLOW_EAGER_USE_XLA +TEST_F(CApiFunctionTest, StatelessIf_XLA) { + TF_Function* func; + const std::string funcName = "BranchFunc"; + DefineFunction(funcName.c_str(), &func); + TF_GraphCopyFunction(host_graph_, func, nullptr, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Operation* feed = Placeholder(host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_Operation* true_cond = ScalarConst(true, host_graph_, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_OperationDescription* desc = + TF_NewOperation(host_graph_, "StatelessIf", "IfNode"); + TF_AddInput(desc, {true_cond, 0}); + TF_Output inputs[] = {{feed, 0}}; + TF_AddInputList(desc, inputs, TF_ARRAYSIZE(inputs)); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_SetAttrType(desc, "Tcond", TF_BOOL); + TF_DataType inputType = TF_INT32; + TF_SetAttrTypeList(desc, "Tin", &inputType, 1); + TF_SetAttrTypeList(desc, "Tout", &inputType, 1); + TF_SetAttrFuncName(desc, "then_branch", funcName.data(), funcName.size()); + TF_SetAttrFuncName(desc, "else_branch", funcName.data(), funcName.size()); + TF_SetDevice(desc, "/device:XLA_CPU:0"); + auto op = TF_FinishOperation(desc, s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + ASSERT_NE(op, nullptr); + + // Create a session for this graph. + CSession csession(host_graph_, s_, /*use_XLA*/ true); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + // Run the graph. + csession.SetInputs({{feed, Int32Tensor(17)}}); + csession.SetOutputs({op}); + csession.Run(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + TF_Tensor* out = csession.output_tensor(0); + ASSERT_TRUE(out != nullptr); + EXPECT_EQ(TF_INT32, TF_TensorType(out)); + EXPECT_EQ(0, TF_NumDims(out)); // scalar + ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out)); + int32* output_contents = static_cast(TF_TensorData(out)); + EXPECT_EQ(-17, *output_contents); + + // Clean up + csession.CloseAndDelete(s_); + ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); + + TF_DeleteFunction(func); +} +#endif // TENSORFLOW_EAGER_USE_XLA + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc index 24eb6c069b21349fce288db3e79fbf14e824ad11..f15d9ee20adb31a0b76e2cd0d1e67f17a9deff05 100644 --- a/tensorflow/c/c_test_util.cc +++ b/tensorflow/c/c_test_util.cc @@ -26,6 +26,10 @@ limitations under the License. using tensorflow::GraphDef; using tensorflow::NodeDef; +static void BoolDeallocator(void* data, size_t, void* arg) { + delete[] static_cast(data); +} + static void Int32Deallocator(void* data, size_t, void* arg) { delete[] static_cast(data); } @@ -38,6 +42,14 @@ static void FloatDeallocator(void* data, size_t, void* arg) { delete[] static_cast(data); } +TF_Tensor* BoolTensor(bool v) { + const int num_bytes = sizeof(bool); + bool* values = new bool[1]; + values[0] = v; + return TF_NewTensor(TF_BOOL, nullptr, 0, values, num_bytes, &BoolDeallocator, + nullptr); +} + TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values) { int64_t num_values = 1; for (int i = 0; i < num_dims; ++i) { @@ -131,6 +143,12 @@ TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, return op; } +TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s, + const char* name) { + unique_tensor_ptr tensor(BoolTensor(v), TF_DeleteTensor); + return Const(tensor.get(), graph, s, name); +} + TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, const char* name) { unique_tensor_ptr tensor(Int32Tensor(v), TF_DeleteTensor); diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h index 38313d647ca93d4779bb1325f8ed7bde4b743879..7eeb1ee5e17ad7e5644f8bc8a18ca967b108475d 100644 --- a/tensorflow/c/c_test_util.h +++ b/tensorflow/c/c_test_util.h @@ -31,6 +31,8 @@ using ::tensorflow::string; typedef std::unique_ptr unique_tensor_ptr; +TF_Tensor* BoolTensor(int32_t v); + // Create a tensor with values of type TF_INT8 provided by `values`. TF_Tensor* Int8Tensor(const int64_t* dims, int num_dims, const char* values); @@ -55,6 +57,9 @@ TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, TF_Operation* Const(TF_Tensor* t, TF_Graph* graph, TF_Status* s, const char* name = "const"); +TF_Operation* ScalarConst(bool v, TF_Graph* graph, TF_Status* s, + const char* name = "scalar"); + TF_Operation* ScalarConst(int32_t v, TF_Graph* graph, TF_Status* s, const char* name = "scalar"); diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 7321b4b791ffa722e9d3c7722c43297b0eae1eab..dfb1c9a37644c726e1eabab775593596d5b556b9 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -110,7 +110,7 @@ tensorflow::Status GetAllRemoteDevices( tensorflow::Status CreateRemoteContexts( const std::vector& remote_workers, int64 rendezvous_id, - const tensorflow::ServerDef& server_def, + int keep_alive_secs, const tensorflow::ServerDef& server_def, tensorflow::eager::EagerClientCache* remote_eager_workers, bool async, tensorflow::gtl::FlatMap* remote_contexts) { for (int i = 0; i < remote_workers.size(); i++) { @@ -129,6 +129,7 @@ tensorflow::Status CreateRemoteContexts( request.mutable_server_def()->set_job_name(parsed_name.job); request.mutable_server_def()->set_task_index(parsed_name.task); request.set_async(async); + request.set_keep_alive_secs(keep_alive_secs); auto* eager_client = remote_eager_workers->GetClient(remote_worker); if (eager_client == nullptr) { return tensorflow::errors::Internal( @@ -150,8 +151,9 @@ tensorflow::Status CreateRemoteContexts( return tensorflow::Status::OK(); } -tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts, - TFE_Context** ctx) { +tensorflow::Status UpdateTFE_ContextWithServerDef( + int keep_alive_secs, const tensorflow::ServerDef& server_def, + TFE_Context* ctx) { // We don't use the TF_RETURN_IF_ERROR macro directly since that destroys the // server object (which currently CHECK-fails) and we miss the error, instead, // we log the error, and then return to allow the user to see the error @@ -165,12 +167,12 @@ tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts, } \ } while (0); - string worker_name = tensorflow::strings::StrCat( - "/job:", opts->server_def.job_name(), - "/replica:0/task:", opts->server_def.task_index()); + string worker_name = + tensorflow::strings::StrCat("/job:", server_def.job_name(), + "/replica:0/task:", server_def.task_index()); std::unique_ptr server; - LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(opts->server_def, &server)); + LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &server)); tensorflow::GrpcServer* grpc_server = dynamic_cast(server.get()); @@ -202,15 +204,15 @@ tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts, // Initialize remote eager workers. tensorflow::gtl::FlatMap remote_contexts; LOG_AND_RETURN_IF_ERROR(CreateRemoteContexts( - remote_workers, rendezvous_id, opts->server_def, - remote_eager_workers.get(), opts->async, &remote_contexts)); + remote_workers, rendezvous_id, keep_alive_secs, server_def, + remote_eager_workers.get(), ctx->context.Async(), &remote_contexts)); tensorflow::RemoteRendezvous* r = grpc_server->worker_env()->rendezvous_mgr->Find(rendezvous_id); auto session_name = tensorflow::strings::StrCat("eager_", rendezvous_id); TF_RETURN_IF_ERROR(grpc_server->worker_env()->session_mgr->CreateSession( - session_name, opts->server_def, true)); + session_name, server_def, true)); std::shared_ptr worker_session; TF_RETURN_IF_ERROR( @@ -221,10 +223,11 @@ tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts, TF_RETURN_IF_ERROR(r->Initialize(worker_session.get())); auto* device_mgr = grpc_server->worker_env()->device_mgr; - *ctx = new TFE_Context(opts->session_options.options, opts->policy, - opts->async, device_mgr, r, std::move(server), - std::move(remote_eager_workers), - std::move(remote_device_mgr), remote_contexts); + + ctx->context.InitializeRemote(std::move(server), + std::move(remote_eager_workers), + std::move(remote_device_mgr), remote_contexts, + r, device_mgr, keep_alive_secs); return tensorflow::Status::OK(); #undef LOG_AND_RETURN_IF_ERROR @@ -249,15 +252,6 @@ void TFE_ContextOptionsSetDevicePlacementPolicy( options->policy = policy; } -TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef( - TFE_ContextOptions* options, const void* proto, size_t proto_len, - TF_Status* status) { - if (!options->server_def.ParseFromArray(proto, proto_len)) { - status->status = tensorflow::errors::InvalidArgument( - "Invalid tensorflow.ServerDef protocol buffer"); - } -} - TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, unsigned char async, TF_Status* status) { @@ -267,12 +261,6 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { - if (!opts->server_def.job_name().empty()) { - TFE_Context* ctx = nullptr; - status->status = NewRemoteAwareTFE_Context(opts, &ctx); - return ctx; - } - std::vector devices; status->status = tensorflow::DeviceFactory::AddDevices( opts->session_options.options, "/job:localhost/replica:0/task:0", @@ -301,6 +289,22 @@ TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) { void TFE_ContextClearCaches(TFE_Context* ctx) { ctx->context.ClearCaches(); } +// Set server_def on the context, possibly updating it. +TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status) { + tensorflow::ServerDef server_def; + if (!server_def.ParseFromArray(proto, proto_len)) { + status->status = tensorflow::errors::InvalidArgument( + "Invalid tensorflow.ServerDef protocol buffer"); + return; + } + status->status = + UpdateTFE_ContextWithServerDef(keep_alive_secs, server_def, ctx); +} + void TFE_ContextSetThreadLocalDevicePlacementPolicy( TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) { ctx->context.SetThreadLocalDevicePlacementPolicy( @@ -348,6 +352,11 @@ TF_DataType TFE_TensorHandleDataType(TFE_TensorHandle* h) { } int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return -1; + } int result; status->status = h->handle->NumDims(&result); return result; @@ -355,12 +364,22 @@ int TFE_TensorHandleNumDims(TFE_TensorHandle* h, TF_Status* status) { int64_t TFE_TensorHandleDim(TFE_TensorHandle* h, int dim_index, TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return -1; + } tensorflow::int64 result; status->status = h->handle->Dim(dim_index, &result); return result; } const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return nullptr; + } tensorflow::Device* d = nullptr; status->status = h->handle->OpDevice(&d); return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0" @@ -368,6 +387,11 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) { } TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr || h->handle == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "The passed in handle is a nullptr"); + return nullptr; + } // TODO(agarwal): move this implementation inside TFE_TensorHandle. tensorflow::Device* d = nullptr; tensorflow::Device* op_device = nullptr; @@ -700,6 +724,10 @@ TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func, } } // namespace +void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context.StartStep(); } + +void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context.EndStep(); } + namespace tensorflow { void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value, diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index ea019a5711c1bbd4547819e976acf98fc06ecbde..a0ebc6fa0a22ed61be91c2974352c2988fb4cd92 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -81,16 +81,6 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy); -// A tensorflow.ServerDef specifies remote workers (in addition to the current -// workers name). Operations created on this context can then be executed on -// any of these remote workers by setting an appropriate device. -// -// If the following is set, all servers identified by the -// ServerDef must be up when the context is created. -TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef( - TFE_ContextOptions* options, const void* proto, size_t proto_len, - TF_Status* status); - // Destroy an options object. TF_CAPI_EXPORT extern void TFE_DeleteContextOptions(TFE_ContextOptions*); @@ -127,6 +117,18 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context*, unsigned char async, TF_Status* status); +// A tensorflow.ServerDef specifies remote workers (in addition to the current +// workers name). Operations created on this context can then be executed on +// any of these remote workers by setting an appropriate device. +// +// If the following is set, all servers identified by the +// ServerDef must be up when the context is created. +TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, + int keep_alive_secs, + const void* proto, + size_t proto_len, + TF_Status* status); + // Causes the calling thread to block till all ops dispatched in async mode // have been executed. Note that "execution" here refers to kernel execution / // scheduling of copies, etc. Similar to sync execution, it doesn't guarantee @@ -379,6 +381,16 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status); +// Some TF ops need a step container to be set to limit the lifetime of some +// resources (mostly TensorArray and Stack, used in while loop gradients in +// graph mode). Calling this on a context tells it to start a step. +TF_CAPI_EXPORT extern void TFE_ContextStartStep(TFE_Context* ctx); + +// Ends a step. When there is no active step (that is, every started step has +// been ended) step containers will be cleared. Note: it is not safe to call +// TFE_ContextEndStep while ops which rely on the step container may be running. +TF_CAPI_EXPORT extern void TFE_ContextEndStep(TFE_Context* ctx); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 4c5077023d5bb3b83808bf3908e7110dd026e3ad..a5c0681e2e4eddae08954d9d0178ca96a3f8f29a 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -59,7 +59,6 @@ struct TFE_ContextOptions { // true if async execution is enabled. bool async = false; TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT}; - tensorflow::ServerDef server_def; }; struct TFE_Context { @@ -73,23 +72,6 @@ struct TFE_Context { default_policy), async, std::move(device_mgr), rendezvous) {} - explicit TFE_Context( - const tensorflow::SessionOptions& opts, - TFE_ContextDevicePlacementPolicy default_policy, bool async, - tensorflow::DeviceMgr* local_device_mgr, - tensorflow::Rendezvous* rendezvous, - std::unique_ptr server, - std::unique_ptr remote_eager_workers, - std::unique_ptr remote_device_mgr, - const tensorflow::gtl::FlatMap& - remote_contexts) - : context(opts, - static_cast( - default_policy), - async, local_device_mgr, rendezvous, std::move(server), - std::move(remote_eager_workers), std::move(remote_device_mgr), - remote_contexts) {} - tensorflow::EagerContext context; }; diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 0bdea70fe6b53ec374d856984741b211258b1d13..71d5f3613c89762633113b4e1dfb82b8199a1cd1 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -108,14 +108,14 @@ TEST(CAPI, Context) { TF_DeleteStatus(status); } -tensorflow::ServerDef GetServerDef(int num_tasks) { +tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { tensorflow::ServerDef server_def; server_def.set_protocol("grpc"); - server_def.set_job_name("localhost"); + server_def.set_job_name(job_name); server_def.set_task_index(0); tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name("localhost"); + job_def->set_name(job_name); for (int i = 0; i < num_tasks; i++) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( @@ -124,6 +124,10 @@ tensorflow::ServerDef GetServerDef(int num_tasks) { return server_def; } +tensorflow::ServerDef GetServerDef(int num_tasks) { + return GetServerDef("localhost", num_tasks); +} + void TestRemoteExecute(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -140,9 +144,6 @@ void TestRemoteExecute(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(), - status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_ContextOptionsSetAsync(opts, static_cast(async)); TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_EXPLICIT); @@ -150,6 +151,9 @@ void TestRemoteExecute(bool async) { EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(); const char remote_device_name[] = @@ -195,8 +199,8 @@ void TestRemoteExecute(bool async) { TFE_DeleteOp(matmul); TFE_ContextAsyncWait(ctx, status); - TFE_DeleteContext(ctx); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContext(ctx); TF_DeleteStatus(status); @@ -229,15 +233,15 @@ void TestRemoteExecuteSilentCopies(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(), - status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_ContextOptionsSetAsync(opts, static_cast(async)); TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); TFE_Context* ctx = TFE_NewContext(opts, status); EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(); const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; @@ -296,6 +300,147 @@ TEST(CAPI, RemoteExecuteSilentCopiesAsync) { TestRemoteExecuteSilentCopies(true); } +void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, + const std::vector& expected_values) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = TFE_TensorHandleResolve(handle, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + std::unique_ptr actual_values(new float[expected_values.size()]); + EXPECT_EQ(sizeof(float) * expected_values.size(), TF_TensorByteSize(t)); + memcpy(actual_values.get(), TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + + for (int i = 0; i < expected_values.size(); i++) { + EXPECT_EQ(expected_values[i], actual_values[i]) + << "Mismatch in expected values at (zero-based) index " << i; + } +} + +void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, + const char* remote_device_name, + const char* local_device_name) { + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); + + TFE_Op* matmul = MatMulOp(ctx, h0_task0, h0_task0); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + auto* retval_task0 = + TFE_TensorHandleCopyToDevice(retvals[0], ctx, local_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + CheckTFE_TensorHandleHasFloats(retval_task0, {7, 10, 15, 22}); + + TFE_DeleteTensorHandle(retval_task0); + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); +} + +void TestRemoteExecuteChangeServerDef(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + const char local_device_name[] = + "/job:localhost/replica:0/task:0/device:CPU:0"; + CheckRemoteMatMulExecutesOK(ctx, remote_device_name, local_device_name); + + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // TODO(nareshmodi): Figure out how to correctly shut the server down. + worker_server.release(); + + // Update the server def with a new set of names (worker instead of + // localhost). + tensorflow::ServerDef updated_server_def = GetServerDef("worker", 2); + serialized = updated_server_def.SerializeAsString(); + + updated_server_def.set_task_index(1); + tensorflow::Status s = tensorflow::GrpcServer::Create( + updated_server_def, tensorflow::Env::Default(), &worker_server); + ASSERT_TRUE(s.ok()) << s.error_message(); + ASSERT_TRUE(worker_server->Start().ok()); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Create a new tensor_handle. + TFE_TensorHandle* h0_task0_new = TestMatrixTensorHandle(); + + // Check that copying it to the old remote device (named localhost) fails. + TFE_TensorHandleCopyToDevice(h0_task0_new, ctx, remote_device_name, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Copying and executing on the new remote device works. + const char new_remote_device_name[] = + "/job:worker/replica:0/task:1/device:CPU:0"; + const char new_local_device_name[] = + "/job:worker/replica:0/task:0/device:CPU:0"; + + auto* h0_task1_new = TFE_TensorHandleCopyToDevice( + h0_task0_new, ctx, new_remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_DeleteTensorHandle(h0_task0_new); + TFE_DeleteTensorHandle(h0_task1_new); + + CheckRemoteMatMulExecutesOK(ctx, new_remote_device_name, + new_local_device_name); + + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_DeleteStatus(status); + + TFE_DeleteContext(ctx); + + // TODO(nareshmodi): Figure out how to correctly shut the server down. + worker_server.release(); +} + +TEST(CAPI, RemoteExecuteChangeServerDef) { + TestRemoteExecuteChangeServerDef(false); +} +TEST(CAPI, RemoteExecuteChangeServerDefAsync) { + TestRemoteExecuteChangeServerDef(true); +} + TEST(CAPI, TensorHandle) { TFE_TensorHandle* h = TestMatrixTensorHandle(); EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(h)); @@ -615,6 +760,42 @@ void SetAndGetOpDevices(bool async) { TF_DeleteStatus(status); } +TEST(CAPI, TensorHandleNullptr) { + TFE_TensorHandle* h = nullptr; + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + TF_Tensor* t = TFE_TensorHandleResolve(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(t, nullptr); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + const char* device_name = TFE_TensorHandleDeviceName(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(device_name, nullptr); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + int num_dims = TFE_TensorHandleNumDims(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(num_dims, -1); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + int dim = TFE_TensorHandleDim(h, 0, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(dim, -1); + ASSERT_EQ("The passed in handle is a nullptr", + string(TF_Message(status.get()))); +} + void Execute_MatMul_CPU(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index a98f0b00b2c70055f697ed4f15cb14708384b62f..f56521dac0374849081fe94f16feb08e55647b56 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -121,6 +121,7 @@ cc_library( deps = [ ":array_grad", ":data_flow_grad", + ":image_grad", ":math_grad", ":nn_grad", ], @@ -331,6 +332,36 @@ tf_cc_test( ], ) +cc_library( + name = "image_grad", + srcs = ["gradients/image_grad.cc"], + deps = [ + ":cc_ops", + ":cc_ops_internal", + ":grad_op_registry", + ":gradients", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "gradients_image_grad_test", + srcs = ["gradients/image_grad_test.cc"], + deps = [ + ":cc_ops", + ":client_session", + ":grad_op_registry", + ":grad_testutil", + ":gradient_checker", + ":image_grad", + ":testutil", + "//tensorflow/core:lib_internal", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "math_grad", srcs = ["gradients/math_grad.cc"], @@ -348,9 +379,11 @@ tf_cc_test( srcs = ["gradients/math_grad_test.cc"], deps = [ ":cc_ops", + ":client_session", ":grad_op_registry", ":grad_testutil", ":gradient_checker", + ":gradients", ":math_grad", ":testutil", "//tensorflow/core:lib_internal", @@ -595,7 +628,6 @@ tf_cc_binary( copts = tf_copts(), linkopts = select({ "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//tensorflow:darwin": [ "-lm", "-lpthread", diff --git a/tensorflow/cc/client/client_session.cc b/tensorflow/cc/client/client_session.cc index ba056a8f3a84910aebf5079573cb64c19f41469d..0e61089a5950ee894ad5489317757cff8a85e966 100644 --- a/tensorflow/cc/client/client_session.cc +++ b/tensorflow/cc/client/client_session.cc @@ -127,4 +127,22 @@ Status ClientSession::Run(const RunOptions& run_options, const FeedType& inputs, target_node_names, outputs, run_metadata); } +Status ClientSession::MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle) { + TF_RETURN_IF_ERROR(impl()->MaybeExtendGraph()); + return impl()->session_->MakeCallable(callable_options, out_handle); +} + +Status ClientSession::RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata) { + return impl()->session_->RunCallable(handle, feed_tensors, fetch_tensors, + run_metadata); +} + +Status ClientSession::ReleaseCallable(CallableHandle handle) { + return impl()->session_->ReleaseCallable(handle); +} + } // end namespace tensorflow diff --git a/tensorflow/cc/client/client_session.h b/tensorflow/cc/client/client_session.h index 5fb4109f7d15d5997f745acd913e60a02855fd73..7dd653eec4ec729b652cb779d06e820bfb437b3c 100644 --- a/tensorflow/cc/client/client_session.h +++ b/tensorflow/cc/client/client_session.h @@ -87,7 +87,33 @@ class ClientSession { const std::vector& run_outputs, std::vector* outputs, RunMetadata* run_metadata) const; - // TODO(keveman): Add support for partial run. + /// \brief A handle to a subgraph, created with + /// `ClientSession::MakeCallable()`. + typedef int64 CallableHandle; + + /// \brief Creates a `handle` for invoking the subgraph defined by + /// `callable_options`. + /// NOTE: This API is still experimental and may change. + Status MakeCallable(const CallableOptions& callable_options, + CallableHandle* out_handle); + + /// \brief Invokes the subgraph named by `handle` with the given options and + /// input tensors. + /// + /// The order of tensors in `feed_tensors` must match the order of names in + /// `CallableOptions::feed()` and the order of tensors in `fetch_tensors` will + /// match the order of names in `CallableOptions::fetch()` when this subgraph + /// was created. + /// NOTE: This API is still experimental and may change. + Status RunCallable(CallableHandle handle, + const std::vector& feed_tensors, + std::vector* fetch_tensors, + RunMetadata* run_metadata); + + /// \brief Releases resources associated with the given `handle` in this + /// session. + /// NOTE: This API is still experimental and may change. + Status ReleaseCallable(CallableHandle handle); private: class Impl; diff --git a/tensorflow/cc/client/client_session_test.cc b/tensorflow/cc/client/client_session_test.cc index ea5cf5a1f12be316cc6e0d0a02cd3caf4d177400..559ffea7e817526e7f1396cd0e8187d01364f23b 100644 --- a/tensorflow/cc/client/client_session_test.cc +++ b/tensorflow/cc/client/client_session_test.cc @@ -95,5 +95,26 @@ TEST(ClientSessionTest, MultiThreaded) { test::ExpectTensorEqual(outputs[0], test::AsTensor({-1, 2}, {2})); } +TEST(ClientSessionTest, Callable) { + Scope root = Scope::NewRootScope(); + auto a = Placeholder(root, DT_INT32); + auto b = Placeholder(root, DT_INT32); + auto c = Add(root, a, b); + ClientSession session(root); + std::vector outputs; + + CallableOptions options; + options.add_feed(a.node()->name()); + options.add_feed(b.node()->name()); + options.add_fetch(c.node()->name()); + ClientSession::CallableHandle callable; + TF_CHECK_OK(session.MakeCallable(options, &callable)); + TF_EXPECT_OK(session.RunCallable( + callable, {test::AsTensor({1}, {}), test::AsTensor({41}, {})}, + &outputs, nullptr)); + test::ExpectTensorEqual(outputs[0], test::AsTensor({42}, {})); + TF_EXPECT_OK(session.ReleaseCallable(callable)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/framework/gradient_checker.cc b/tensorflow/cc/framework/gradient_checker.cc index de2645cb440bda1f35e764af9197ca97bb760c08..e9f9c59e3aa0e8a9dc5d5e658540e9da73adaca5 100644 --- a/tensorflow/cc/framework/gradient_checker.cc +++ b/tensorflow/cc/framework/gradient_checker.cc @@ -247,7 +247,7 @@ Status ComputeNumericJacobianTranspose(const Scope& scope, const OutputList& xs, auto y_pos_flat = y_pos[y_idx].flat(); auto y_neg_flat = y_neg[y_idx].flat(); const int64 y_size = y_shapes[y_idx].num_elements(); - const Y_T scale = Y_T{2 * delta}; + const Y_T scale = 2 * delta; auto jacobian = (*jacobian_ts)[x_idx * y_num + y_idx].matrix(); for (int c = 0; c < y_size; ++c) { SetJacobian(&jacobian, r * x_stride + unit_dimension, @@ -351,7 +351,14 @@ Status ComputeGradientErrorInternal(const Scope& scope, const OutputList& xs, auto jac_n = jacobian_ns[i].matrix(); for (int r = 0; r < jacobian_ts[i].dim_size(0); ++r) { for (int c = 0; c < jacobian_ts[i].dim_size(1); ++c) { - *max_error = std::max(*max_error, std::fabs(jac_t(r, c) - jac_n(r, c))); + auto cur_error = std::fabs(jac_t(r, c) - jac_n(r, c)); + // Treat any NaN as max_error and immediately return. + // (Note that std::max may ignore NaN arguments.) + if (std::isnan(cur_error)) { + *max_error = cur_error; + return Status::OK(); + } + *max_error = std::max(*max_error, cur_error); } } } @@ -409,6 +416,7 @@ Status ComputeGradientError(const Scope& scope, const Output& x, const Output& y, const TensorShape& y_shape, JAC_T* max_error); INSTANTIATE_GRAD_ERR_TYPE(float, float, float); +INSTANTIATE_GRAD_ERR_TYPE(double, float, double); INSTANTIATE_GRAD_ERR_TYPE(double, double, double); INSTANTIATE_GRAD_ERR_TYPE(complex64, float, float); INSTANTIATE_GRAD_ERR_TYPE(float, complex64, float); diff --git a/tensorflow/cc/framework/gradient_checker_test.cc b/tensorflow/cc/framework/gradient_checker_test.cc index d4f0a7f5ab3716be41e22c02a21aca028f76fb88..8dd762c282eff287bddd49ea6f38b2b8060949b0 100644 --- a/tensorflow/cc/framework/gradient_checker_test.cc +++ b/tensorflow/cc/framework/gradient_checker_test.cc @@ -28,12 +28,14 @@ namespace { using ops::Complex; using ops::Const; +using ops::Div; using ops::MatMul; using ops::Placeholder; using ops::Real; using ops::Split; using ops::Square; using ops::Stack; +using ops::Sub; using ops::Unstack; TEST(GradientCheckerTest, BasicFloat) { @@ -104,6 +106,20 @@ TEST(GradientCheckerTest, Complex64ToFloat) { EXPECT_LT(max_error, 1e-4); } +// When calculating gradients that are undefined, test we get NaN +// as the computed error rather than 0. +TEST(GradientCheckerTest, BasicNan) { + Scope scope = Scope::NewRootScope(); + TensorShape shape({2, 4, 3}); + auto x = Placeholder(scope, DT_FLOAT, Placeholder::Shape(shape)); + // y = x/(x-x) should always return NaN + auto y = Div(scope, x, Sub(scope, x, x)); + float max_error; + TF_ASSERT_OK((ComputeGradientError( + scope, {x}, {shape}, {y}, {shape}, &max_error))); + EXPECT_TRUE(std::isnan(max_error)); +} + TEST(GradientCheckerTest, MatMulGrad) { Scope scope = Scope::NewRootScope(); diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index b353accddcb6db9a07c112de03ead2f02c4ee6a6..e9173227aadbf86eab666e6c17bacacb92888572 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -120,6 +120,24 @@ Status SplitGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Split", SplitGrad); +Status FillGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // y = fill(fill_shape, x) + // No gradient returned for the fill_shape argument. + grad_outputs->push_back(NoGradient()); + // The gradient for x (which must be a scalar) is just the sum of + // all the gradients from the shape it fills. + // We use ReduceSum to implement this, which needs an argument providing + // the indices of all the dimensions of the incoming gradient. + // grad(x) = reduce_sum(grad(y), [0..rank(grad(y))]) + auto all_dims = Range(scope, Const(scope, 0), Rank(scope, grad_inputs[0]), + Const(scope, 1)); + grad_outputs->push_back(ReduceSum(scope, grad_inputs[0], all_dims)); + return scope.status(); +} +REGISTER_GRADIENT_OP("Fill", FillGrad); + Status DiagGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc index d09275b6487b4212aa35a0476002f2bb587fa210..f41de3dc2098df55fbbb616557f264a4e70db6b6 100644 --- a/tensorflow/cc/gradients/array_grad_test.cc +++ b/tensorflow/cc/gradients/array_grad_test.cc @@ -108,6 +108,14 @@ TEST_F(ArrayGradTest, SplitGrad) { RunTest({x}, {x_shape}, y.output, {y_shape, y_shape}); } +TEST_F(ArrayGradTest, FillGrad) { + TensorShape x_shape({}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + TensorShape y_shape({2, 5, 3}); + auto y = Fill(scope_, {2, 5, 3}, x); + RunTest(x, x_shape, y, y_shape); +} + TEST_F(ArrayGradTest, DiagGrad) { TensorShape x_shape({5, 2}); auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc new file mode 100644 index 0000000000000000000000000000000000000000..882709e1e2817431a32c453fe0f35f2b2e6c69b0 --- /dev/null +++ b/tensorflow/cc/gradients/image_grad.cc @@ -0,0 +1,74 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" +#include "tensorflow/cc/ops/image_ops_internal.h" +#include "tensorflow/cc/ops/standard_ops.h" + +namespace tensorflow { +namespace ops { +namespace { + +Status ResizeNearestNeighborGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners)); + // The internal gradient implementation needs the shape of the input image. + // x_shape = shape(x)[1:3] + // = slice(shape(x), {1}, {3 - 1}) + auto x_shape = Slice(scope, Shape(scope, op.input(0)), {1}, {2}); + grad_outputs->push_back(internal::ResizeNearestNeighborGrad( + scope, grad_inputs[0], x_shape, + internal::ResizeNearestNeighborGrad::AlignCorners(align_corners))); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("ResizeNearestNeighbor", ResizeNearestNeighborGradHelper); + +Status ResizeBilinearGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners)); + grad_outputs->push_back(internal::ResizeBilinearGrad( + scope, grad_inputs[0], op.input(0), + internal::ResizeBilinearGrad::AlignCorners(align_corners))); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("ResizeBilinear", ResizeBilinearGradHelper); + +Status ResizeBicubicGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool align_corners; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "align_corners", &align_corners)); + grad_outputs->push_back(internal::ResizeBicubicGrad( + scope, grad_inputs[0], op.input(0), + internal::ResizeBicubicGrad::AlignCorners(align_corners))); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("ResizeBicubic", ResizeBicubicGradHelper); + +} // anonymous namespace +} // namespace ops +} // namespace tensorflow diff --git a/tensorflow/cc/gradients/image_grad_test.cc b/tensorflow/cc/gradients/image_grad_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e55c7561b030c50bd67bd53fd0d55710085c5d2 --- /dev/null +++ b/tensorflow/cc/gradients/image_grad_test.cc @@ -0,0 +1,157 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/client/client_session.h" +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradient_checker.h" +#include "tensorflow/cc/framework/testutil.h" +#include "tensorflow/cc/gradients/grad_testutil.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +using ops::Const; +using ops::ResizeBicubic; +using ops::ResizeBilinear; +using ops::ResizeNearestNeighbor; + +class ImageGradTest : public ::testing::Test { + protected: + ImageGradTest() : scope_(Scope::NewRootScope()) {} + + enum OpType { RESIZE_NEAREST, RESIZE_BILINEAR, RESIZE_BICUBIC }; + + template + Tensor MakeData(const TensorShape& data_shape) { + DataType data_type = DataTypeToEnum::v(); + Tensor data(data_type, data_shape); + auto data_flat = data.flat(); + for (int i = 0; i < data_flat.size(); ++i) { + data_flat(i) = T(i); + } + return data; + } + + template + void MakeOp(const OpType op_type, const Tensor& x_data, const Input& y_shape, + const bool align_corners, Output* x, Output* y) { + *x = Const(scope_, x_data); + switch (op_type) { + case RESIZE_NEAREST: + *y = ResizeNearestNeighbor( + scope_, *x, y_shape, + ResizeNearestNeighbor::AlignCorners(align_corners)); + return; + case RESIZE_BILINEAR: + *y = ResizeBilinear(scope_, *x, y_shape, + ResizeBilinear::AlignCorners(align_corners)); + return; + case RESIZE_BICUBIC: + *y = ResizeBicubic(scope_, *x, y_shape, + ResizeBicubic::AlignCorners(align_corners)); + return; + } + assert(false); + } + + template + void TestResizedShapeForType(const OpType op_type, const bool align_corners) { + TensorShape x_shape({1, 2, 2, 1}); + Tensor x_data = MakeData(x_shape); + Output x, y; + MakeOp(op_type, x_data, {4, 6}, align_corners, &x, &y); + + ClientSession session(scope_); + std::vector outputs; + TF_ASSERT_OK(session.Run({y}, &outputs)); + EXPECT_EQ(outputs.size(), 1); + EXPECT_EQ(outputs[0].shape(), TensorShape({1, 4, 6, 1})); + } + + void TestResizedShape(OpType op_type) { + for (const bool align_corners : {true, false}) { + TestResizedShapeForType(op_type, align_corners); + TestResizedShapeForType(op_type, align_corners); + TestResizedShapeForType(op_type, align_corners); + } + } + + template + void TestResizeToSmallerAndAlign(const OpType op_type, + const bool align_corners) { + TensorShape x_shape({1, 4, 6, 1}); + Tensor x_data = MakeData(x_shape); + Output x, y; + MakeOp(op_type, x_data, {2, 3}, align_corners, &x, &y); + JAC_T max_error; + TF_ASSERT_OK((ComputeGradientError( + scope_, x, x_data, y, {1, 2, 3, 1}, &max_error))); + EXPECT_LT(max_error, 1e-3); + } + + template + void TestResizeToLargerAndAlign(const OpType op_type, + const bool align_corners) { + TensorShape x_shape({1, 2, 3, 1}); + Tensor x_data = MakeData(x_shape); + Output x, y; + MakeOp(op_type, x_data, {4, 6}, align_corners, &x, &y); + JAC_T max_error; + TF_ASSERT_OK((ComputeGradientError( + scope_, x, x_data, y, {1, 4, 6, 1}, &max_error))); + EXPECT_LT(max_error, 1e-3); + } + + template + void TestResize(OpType op_type) { + for (const bool align_corners : {true, false}) { + TestResizeToSmallerAndAlign(op_type, align_corners); + TestResizeToLargerAndAlign(op_type, align_corners); + } + } + + Scope scope_; +}; + +TEST_F(ImageGradTest, TestNearestNeighbor) { + TestResizedShape(RESIZE_NEAREST); + TestResize(RESIZE_NEAREST); + TestResize(RESIZE_NEAREST); +} + +TEST_F(ImageGradTest, TestBilinear) { + TestResizedShape(RESIZE_BILINEAR); + TestResize(RESIZE_BILINEAR); + // Note that Y_T is always float for this op. We choose + // double for the jacobian to capture the higher precision + // between X_T and Y_T. + TestResize(RESIZE_BILINEAR); +} + +TEST_F(ImageGradTest, TestBicubic) { + TestResizedShape(RESIZE_BICUBIC); + TestResize(RESIZE_BICUBIC); + // Note that Y_T is always float for this op. We choose + // double for the jacobian to capture the higher precision + // between X_T and Y_T. + TestResize(RESIZE_BICUBIC); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 35a01e0341cb08c9b314908b6dcd76fd99c1e68b..5dcf00857df0eabd4e99f2782c1910515a9be265 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -441,6 +441,22 @@ Status RealDivGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("RealDiv", RealDivGrad); +Status UnsafeDivGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto x_1 = ConjugateHelper(scope, op.input(0)); + auto x_2 = ConjugateHelper(scope, op.input(1)); + // y = x_1 / x_2 + // dy/dx_1 = 1/x_2 + // dy/dx_2 = -x_1/x_2^2 + auto gx_1 = UnsafeDiv(scope, grad_inputs[0], x_2); + auto gx_2 = + Mul(scope, grad_inputs[0], + UnsafeDiv(scope, UnsafeDiv(scope, Neg(scope, x_1), x_2), x_2)); + return BinaryGradCommon(scope, op, grad_outputs, gx_1, gx_2); +} +REGISTER_GRADIENT_OP("UnsafeDiv", UnsafeDivGrad); + Status SquaredDifferenceGrad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, std::vector* grad_outputs) { @@ -1007,6 +1023,26 @@ Status ProdGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("Prod", ProdGrad); +Status SegmentSumGrad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + // The SegmentSum operation sums segments of the Tensor that have the same + // index in the segment_ids parameter. + // i.e z = [2, 3, 4, 5], segment_ids [0, 0, 0, 1] + // will produce [2 + 3 + 4, 5] = [9, 5] + // The gradient that will flow back to the gather operation will look like + // [x1, x2], it will have the same shape as the output of the SegmentSum + // operation. The differentiation step of the SegmentSum operation just + // broadcast the gradient in order to retrieve the z's shape. + // dy/dz = [x1, x1, x1, x2] + grad_outputs->push_back(Gather(scope, grad_inputs[0], op.input(1))); + + // stop propagation along segment_ids + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("SegmentSum", SegmentSumGrad); + // MatMulGrad helper function used to compute two MatMul operations // based on input matrix transposition combinations. Status MatMulGradHelper(const Scope& scope, const bool is_batch, diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 1c9bdff5e1295135abe60c282d565c39071fd78a..88aef1fab410e11aa17a9e44578f5db95ed6e52b 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/cc/client/client_session.h" #include "tensorflow/cc/framework/grad_op_registry.h" #include "tensorflow/cc/framework/gradient_checker.h" +#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/framework/testutil.h" #include "tensorflow/cc/gradients/grad_testutil.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -42,9 +44,11 @@ using ops::Placeholder; using ops::Pow; using ops::Prod; using ops::RealDiv; +using ops::SegmentSum; using ops::SquaredDifference; using ops::Sub; using ops::Sum; +using ops::UnsafeDiv; // TODO(andydavis) Test gradient function against numeric gradients output. // TODO(andydavis) As more gradients are added move common test functions @@ -850,6 +854,36 @@ TEST_F(NaryGradTest, RealDiv) { RunTest({x}, {x_shape}, {y}, {x_shape}); } +TEST_F(NaryGradTest, UnsafeDiv) { + { + TensorShape x_shape({3, 2, 5}); + const auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Test x / (1 + |x|) rather than x_1 / x_2 to avoid triggering large + // division errors in the numeric estimator used by the gradient checker. + const auto y = UnsafeDiv( + scope_, x, Add(scope_, Const(scope_, 1), Abs(scope_, x))); + RunTest({x}, {x_shape}, {y}, {x_shape}); + } + { + // Return 0 gradient (rather than NaN) for division by zero. + const auto x = Placeholder(scope_, DT_FLOAT); + const auto zero = Const(scope_, 0.0); + const auto y = UnsafeDiv(scope_, x, zero); + + std::vector grad_outputs; + TF_EXPECT_OK(AddSymbolicGradients(scope_, {y}, {x}, &grad_outputs)); + ClientSession session(scope_); + std::vector grad_result; + TF_EXPECT_OK( + session.Run({{x, {-3.0f, 0.0f, 3.0f}}}, grad_outputs, &grad_result)); + EXPECT_EQ(grad_result.size(), 1); + EXPECT_EQ(grad_result[0].NumElements(), 3); + EXPECT_EQ(grad_result[0].flat()(0), 0.0f); + EXPECT_EQ(grad_result[0].flat()(1), 0.0f); + EXPECT_EQ(grad_result[0].flat()(2), 0.0f); + } +} + TEST_F(NaryGradTest, SquaredDifference) { TensorShape x1_shape({3, 2, 5}); TensorShape x2_shape({2, 5}); @@ -898,5 +932,14 @@ TEST_F(NaryGradTest, Prod) { RunTest({x}, {x_shape}, {y}, {y_shape}); } +TEST_F(NaryGradTest, SegmentSum) { + TensorShape x_shape({3, 4}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + auto y = SegmentSum(scope_, x, {0, 0, 1}); + // the sum is always on the first dimension + TensorShape y_shape({2, 4}); + RunTest({x}, {x_shape}, {y}, {y_shape}); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index 98be66a6add67a8053e286521e564286cdb8ef8d..3830416159158cca8bfb8422c2959b49fa42406d 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -170,7 +170,8 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, variables_directory, MetaFilename(kSavedModelVariablesFilename)); if (!Env::Default()->FileExists(variables_index_path).ok()) { LOG(INFO) << "The specified SavedModel has no variables; no checkpoints " - "were restored."; + "were restored. File does not exist: " + << variables_index_path; return Status::OK(); } const string variables_path = diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index fef8b8d4d4cdcc97a913ae2ba6d1a8b0b0084f89..1899a32e4dc5487875f091fece6acf0c44c9243f 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -8,28 +8,6 @@ load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") -# Optional runtime utilities for use by code generated by tfcompile. -cc_library( - name = "runtime", - srcs = ["runtime.cc"], - hdrs = ["runtime.h"], - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/core:framework_lite", - ], -) - -tf_cc_test( - name = "runtime_test", - srcs = ["runtime_test.cc"], - deps = [ - ":runtime", - "//tensorflow/core:framework", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - ], -) - # Don't depend on this directly; this is only used for the benchmark test # generated by tf_library. cc_library( @@ -53,9 +31,9 @@ cc_library( ], deps = [ ":embedded_protocol_buffers", - ":runtime", # needed by codegen to print aligned_buffer_bytes "//tensorflow/compiler/tf2xla", "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:cpu_function_runtime", "//tensorflow/compiler/tf2xla:tf2xla_proto", "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", @@ -70,6 +48,7 @@ cc_library( "//tensorflow/compiler/xla/client:compile_only_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:compiler", + "//tensorflow/compiler/xla/service/cpu:buffer_info_util", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework_internal", @@ -238,7 +217,6 @@ test_suite( tests = [ ":benchmark_test", ":codegen_test", - ":runtime_test", ":test_graph_tfadd_test", ":test_graph_tfunknownop2_test", ":test_graph_tfunknownop3_test", diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 28070d60dbbe6dd8f930b8e6509cedcf09f94e11..89fefdad54fabcc953e72c6aa7a2361468b61259 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -20,10 +20,11 @@ limitations under the License. #include #include "tensorflow/compiler/aot/embedded_protocol_buffers.h" -#include "tensorflow/compiler/aot/runtime.h" +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/compiler/tf2xla/str_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/service/compiler.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" @@ -36,6 +37,8 @@ namespace tfcompile { namespace { +using BufferInfo = cpu_function_runtime::BufferInfo; + bool IsAlpha(char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } @@ -85,27 +88,36 @@ Status XLATypeToCpp(xla::PrimitiveType type, string* str) { return Status::OK(); } -// total_buffer_bytes returns the sum of each size in `sizes`, skipping -1 -// values. There are `n` entries in `sizes`. -size_t total_buffer_bytes(const intptr_t* sizes, size_t n) { - size_t total = 0; - for (size_t i = 0; i < n; ++i) { - if (sizes[i] != -1) { - total += sizes[i]; - } - } - return total; +// Returns the sum of the size of each buffer in `buffer_infos`. +size_t TotalBufferBytes(const std::vector& buffer_infos) { + return std::accumulate(buffer_infos.begin(), buffer_infos.end(), size_t{0}, + [](size_t size, const BufferInfo& buffer_info) { + return size + buffer_info.size(); + }); } -// Fills in arg_sizes with the byte size of each positional arg. -Status ComputeArgSizes(const CompileResult& compile_result, - std::vector* arg_sizes) { - const xla::ProgramShape& ps = compile_result.program_shape; - for (int i = 0; i < ps.parameters_size(); ++i) { - arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf( - ps.parameters(i), compile_result.pointer_size)); - } - return Status::OK(); +// Returns a vector of BufferInfo instances in `buffer_infos` that are entry +// parameter buffers. +std::vector ExtractEntryParamBufferInfos( + const std::vector& buffer_infos) { + std::vector result; + std::copy_if(buffer_infos.begin(), buffer_infos.end(), + std::back_inserter(result), [](const BufferInfo& buffer_info) { + return buffer_info.is_entry_parameter(); + }); + return result; +} + +// Returns a vector of BufferInfo instances in `buffer_infos` that are temp +// buffers. +std::vector ExtractTempBufferInfos( + const std::vector& buffer_infos) { + std::vector result; + std::copy_if(buffer_infos.begin(), buffer_infos.end(), + std::back_inserter(result), [](const BufferInfo& buffer_info) { + return buffer_info.is_temp_buffer(); + }); + return result; } // Add (from,to) rewrite pairs based on the given shape. These rewrite pairs @@ -278,6 +290,25 @@ Status ValidateFeedFetchCppNames(const tf2xla::Config& config) { return Status::OK(); } +// Returns a list of C++ expressions that, when executed, will construct the +// BufferInfo instances in `buffer_infos`. +std::vector BufferInfosToCppExpression( + const std::vector& buffer_infos) { + std::vector buffer_infos_as_strings; + std::transform(buffer_infos.begin(), buffer_infos.end(), + std::back_inserter(buffer_infos_as_strings), + [](const BufferInfo& buffer_info) { + std::pair encoded = buffer_info.Encode(); + string encoded_second_as_str = + encoded.second == ~0ULL + ? "~0ULL" + : strings::StrCat(encoded.second, "ULL"); + return strings::StrCat( + "::tensorflow::cpu_function_runtime::BufferInfo({", + encoded.first, "ULL, ", encoded_second_as_str, "})"); + }); + return buffer_infos_as_strings; +} } // namespace Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, @@ -286,29 +317,35 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, TF_RETURN_IF_ERROR(ValidateConfig(config)); TF_RETURN_IF_ERROR(ValidateFeedFetchCppNames(config)); const int64 result_index = compile_result.aot->result_buffer_index(); - const xla::BufferSizes& temp_sizes = compile_result.aot->buffer_sizes(); - if (result_index < 0 || result_index >= temp_sizes.size()) { + const std::vector& buffer_infos = + compile_result.aot->buffer_infos(); + const std::vector arg_index_table = + ::xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); + std::vector buffer_infos_as_strings = + BufferInfosToCppExpression(buffer_infos); + if (result_index < 0 || result_index >= buffer_infos.size()) { return errors::InvalidArgument("result index: ", result_index, " is outside the range of temp sizes: [0,", - temp_sizes.size(), ")"); + buffer_infos.size(), ")"); } // Compute sizes and generate methods. - std::vector arg_sizes; - TF_RETURN_IF_ERROR(ComputeArgSizes(compile_result, &arg_sizes)); + std::vector buffer_infos_for_args = + ExtractEntryParamBufferInfos(buffer_infos); + std::vector buffer_infos_for_temps = + ExtractTempBufferInfos(buffer_infos); const xla::ProgramShape& ps = compile_result.program_shape; string methods_arg, methods_result; TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); - const std::vector iarg(arg_sizes.begin(), arg_sizes.end()); - const std::vector itemp(temp_sizes.begin(), temp_sizes.end()); - const size_t arg_bytes_aligned = - runtime::aligned_buffer_bytes(iarg.data(), iarg.size()); - const size_t arg_bytes_total = total_buffer_bytes(iarg.data(), iarg.size()); - const size_t temp_bytes_aligned = - runtime::aligned_buffer_bytes(itemp.data(), itemp.size()); - const size_t temp_bytes_total = - total_buffer_bytes(itemp.data(), itemp.size()); + const size_t arg_bytes_aligned = cpu_function_runtime::AlignedBufferBytes( + buffer_infos_for_args.data(), buffer_infos_for_args.size(), + /*allocate_entry_params=*/true); + const size_t arg_bytes_total = TotalBufferBytes(buffer_infos_for_args); + const size_t temp_bytes_aligned = cpu_function_runtime::AlignedBufferBytes( + buffer_infos_for_temps.data(), buffer_infos_for_temps.size(), + /*allocate_entry_params=*/true); + const size_t temp_bytes_total = TotalBufferBytes(buffer_infos_for_temps); // Create rewrite strings for namespace start and end. string ns_start; @@ -343,8 +380,8 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, // calling HloProfilePrinter::profile_counters_size. const string assign_profile_counters_size = opts.gen_hlo_profile_printer_data - ? "data->profile_counters_size = " - "data->hlo_profile_printer_data->profile_counters_size();" + ? "data->set_profile_counters_size(" + "data->hlo_profile_printer_data()->profile_counters_size());" : ""; // Use a poor-man's text templating mechanism; first populate the full header @@ -414,9 +451,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static constexpr size_t kNumArgs = {{ARG_NUM}}; // Byte size of each argument buffer. There are kNumArgs entries. - static const intptr_t* ArgSizes() { - static constexpr intptr_t kArgSizes[kNumArgs] = {{{ARG_SIZES}}}; - return kArgSizes; + static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) { + return BufferInfos()[ArgIndexToBufferIndex()[index]].size(); } // Returns static data used to create an XlaCompiledCpuFunction. @@ -424,17 +460,17 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static XlaCompiledCpuFunction::StaticData* kStaticData = [](){ XlaCompiledCpuFunction::StaticData* data = new XlaCompiledCpuFunction::StaticData; - data->raw_function = {{ENTRY}}; - data->arg_sizes = ArgSizes(); - data->num_args = kNumArgs; - data->temp_sizes = TempSizes(); - data->num_temps = kNumTemps; - data->result_index = kResultIndex; - data->arg_names = StaticArgNames(); - data->result_names = StaticResultNames(); - data->program_shape = StaticProgramShape(); - data->hlo_profile_printer_data = StaticHloProfilePrinterData(); - {{ASSIGN_PROFILE_COUNTERS_SIZE}} + data->set_raw_function({{ENTRY}}); + data->set_buffer_infos(BufferInfos()); + data->set_num_buffers(kNumBuffers); + data->set_arg_index_table(ArgIndexToBufferIndex()); + data->set_num_args(kNumArgs); + data->set_result_index(kResultIndex); + data->set_arg_names(StaticArgNames()); + data->set_result_names(StaticResultNames()); + data->set_program_shape(StaticProgramShape()); + data->set_hlo_profile_printer_data(StaticHloProfilePrinterData()); +{{ASSIGN_PROFILE_COUNTERS_SIZE}} return data; }(); return *kStaticData; @@ -482,17 +518,27 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {{METHODS_RESULT}} private: - // Number of result and temporary buffers for the compiled computation. - static constexpr size_t kNumTemps = {{TEMP_NUM}}; - // The 0-based index of the result tuple in the temporary buffers. - static constexpr size_t kResultIndex = {{RESULT_INDEX}}; + // Number of buffers for the compiled computation. + static constexpr size_t kNumBuffers = {{NUM_BUFFERS}}; - // Byte size of each result / temporary buffer. There are kNumTemps entries. - static const intptr_t* TempSizes() { - static constexpr intptr_t kTempSizes[kNumTemps] = {{{TEMP_SIZES}}}; - return kTempSizes; + static const ::tensorflow::cpu_function_runtime::BufferInfo* BufferInfos() { + static const ::tensorflow::cpu_function_runtime::BufferInfo + kBufferInfos[kNumBuffers] = { +{{BUFFER_INFOS_AS_STRING}} + }; + return kBufferInfos; } + static const ::tensorflow::int32* ArgIndexToBufferIndex() { + static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { +{{ARG_INDEX_TABLE}} + }; + return kArgIndexToBufferIndex; + } + + // The 0-based index of the result tuple in the temporary buffers. + static constexpr size_t kResultIndex = {{RESULT_INDEX}}; + // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() {{ARG_NAMES_CODE}} @@ -523,8 +569,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {"{{ARG_BYTES_ALIGNED}}", strings::StrCat(arg_bytes_aligned)}, {"{{ARG_BYTES_TOTAL}}", strings::StrCat(arg_bytes_total)}, {"{{ARG_NAMES_CODE}}", arg_names_code}, - {"{{ARG_NUM}}", strings::StrCat(arg_sizes.size())}, - {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")}, + {"{{ARG_NUM}}", strings::StrCat(arg_index_table.size())}, + {"{{ARG_INDEX_TABLE}}", str_util::Join(arg_index_table, ", ")}, {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size}, {"{{CLASS}}", opts.class_name}, {"{{DECLS_FROM_OBJ_FILE}}", @@ -546,8 +592,9 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {"{{RESULT_NAMES_CODE}}", result_names_code}, {"{{TEMP_BYTES_ALIGNED}}", strings::StrCat(temp_bytes_aligned)}, {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)}, - {"{{TEMP_NUM}}", strings::StrCat(temp_sizes.size())}, - {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")}}; + {"{{NUM_BUFFERS}}", strings::StrCat(buffer_infos.size())}, + {"{{BUFFER_INFOS_AS_STRING}}", + str_util::Join(buffer_infos_as_strings, ",\n")}}; str_util::ReplaceAllPairs(header, rewrites); return Status::OK(); } diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 29bc9c13b889c86c2ba8776c7b067c54cb05bc43..60d59ae996e8f7ec490c98aeab05182626e61976 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -32,6 +32,8 @@ namespace tensorflow { namespace tfcompile { namespace { +using ::tensorflow::cpu_function_runtime::BufferInfo; + void ExpectErrorContains(const Status& status, StringPiece str) { EXPECT_NE(Status::OK(), status); EXPECT_TRUE(str_util::StrContains(status.error_message(), str)) @@ -171,8 +173,14 @@ TEST(CodegenTest, Golden) { fetch->mutable_id()->set_node_name("fetch0"); fetch->set_name("myfetch"); CompileResult compile_result; - compile_result.aot.reset( - new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5, {})); + compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult( + {}, + {BufferInfo::MakeTempBuffer(1), + BufferInfo::MakeEntryParameter(/*size=*/8, /*param_number=*/0), + BufferInfo::MakeTempBuffer(2), + BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1), + BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)}, + 5, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index 6641d45e83020f4144616a6a2837c844330298f5..e4d8a02877c75fa72c5747650ab9c7ac229955b3 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -65,9 +65,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { static constexpr size_t kNumArgs = 2; // Byte size of each argument buffer. There are kNumArgs entries. - static const intptr_t* ArgSizes() { - static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96}; - return kArgSizes; + static const ::tensorflow::int64 ArgSize(::tensorflow::int32 index) { + return BufferInfos()[ArgIndexToBufferIndex()[index]].size(); } // Returns static data used to create an XlaCompiledCpuFunction. @@ -75,17 +74,17 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { static XlaCompiledCpuFunction::StaticData* kStaticData = [](){ XlaCompiledCpuFunction::StaticData* data = new XlaCompiledCpuFunction::StaticData; - data->raw_function = entry_point; - data->arg_sizes = ArgSizes(); - data->num_args = kNumArgs; - data->temp_sizes = TempSizes(); - data->num_temps = kNumTemps; - data->result_index = kResultIndex; - data->arg_names = StaticArgNames(); - data->result_names = StaticResultNames(); - data->program_shape = StaticProgramShape(); - data->hlo_profile_printer_data = StaticHloProfilePrinterData(); - + data->set_raw_function(entry_point); + data->set_buffer_infos(BufferInfos()); + data->set_num_buffers(kNumBuffers); + data->set_arg_index_table(ArgIndexToBufferIndex()); + data->set_num_args(kNumArgs); + data->set_result_index(kResultIndex); + data->set_arg_names(StaticArgNames()); + data->set_result_names(StaticResultNames()); + data->set_program_shape(StaticProgramShape()); + data->set_hlo_profile_printer_data(StaticHloProfilePrinterData()); + return data; }(); return *kStaticData; @@ -215,17 +214,32 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { } private: - // Number of result and temporary buffers for the compiled computation. - static constexpr size_t kNumTemps = 6; - // The 0-based index of the result tuple in the temporary buffers. - static constexpr size_t kResultIndex = 5; + // Number of buffers for the compiled computation. + static constexpr size_t kNumBuffers = 6; + + static const ::tensorflow::cpu_function_runtime::BufferInfo* BufferInfos() { + static const ::tensorflow::cpu_function_runtime::BufferInfo + kBufferInfos[kNumBuffers] = { +::tensorflow::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({34ULL, 0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({9ULL, ~0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({386ULL, 1ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({13ULL, ~0ULL}), +::tensorflow::cpu_function_runtime::BufferInfo({481ULL, ~0ULL}) + }; + return kBufferInfos; + } - // Byte size of each result / temporary buffer. There are kNumTemps entries. - static const intptr_t* TempSizes() { - static constexpr intptr_t kTempSizes[kNumTemps] = {1, -1, 2, -1, 3, 120}; - return kTempSizes; + static const ::tensorflow::int32* ArgIndexToBufferIndex() { + static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { +1, 3 + }; + return kArgIndexToBufferIndex; } + // The 0-based index of the result tuple in the temporary buffers. + static constexpr size_t kResultIndex = 5; + // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() { static const char* kNames[] = {"myfeed", nullptr}; diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h deleted file mode 100644 index d1a669ceb17b9fd71d26e978035283f8824b0376..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/aot/runtime.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file contains utilities to make it easier to invoke functions generated -// by tfcompile. Usage of these utilities is optional. - -#ifndef TENSORFLOW_COMPILER_AOT_RUNTIME_H_ -#define TENSORFLOW_COMPILER_AOT_RUNTIME_H_ - -#include "tensorflow/core/platform/types.h" - -namespace tensorflow { -namespace tfcompile { -namespace runtime { - -// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 64; - -// aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 -// values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign -// byte boundaries. -size_t aligned_buffer_bytes(const intptr_t* sizes, size_t n); - -// MallocContiguousBuffers allocates buffers for use by the entry point -// generated by tfcompile. `sizes` is an array of byte sizes for each buffer, -// where -1 causes the buffer pointer to be nullptr. There are `n` entries in -// `sizes`. If `annotate_initialized` is set, the allocated memory will be -// annotated as having been initialized - this is useful when allocating -// temporary buffers. -// -// A single contiguous block of memory is allocated, and portions of it are -// parceled out into `bufs`, which must have space for `n` entries. Returns the -// head of the allocated contiguous block, which should be passed to -// FreeContiguous when the buffers are no longer in use. -void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs, - bool annotate_initialized); - -// FreeContiguous frees the contiguous block of memory allocated by -// MallocContiguousBuffers. -void FreeContiguous(void* contiguous); - -} // namespace runtime -} // namespace tfcompile -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_AOT_RUNTIME_H_ diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc index 6b098049cbd7539a2b2e2696b13139a8a6b28e0f..5deb47d12310d24dce847227bd119249210ffb8d 100644 --- a/tensorflow/compiler/aot/test.cc +++ b/tensorflow/compiler/aot/test.cc @@ -51,11 +51,9 @@ namespace tensorflow { namespace tfcompile { namespace { -void zero_buffers(void** bufs, const intptr_t* sizes, size_t n) { - for (int i = 0; i < n; ++i) { - if (sizes[i] != -1) { - memset(bufs[i], 0, sizes[i]); - } +void zero_buffers(XlaCompiledCpuFunction* computation) { + for (int i = 0; i < computation->num_args(); ++i) { + memset(computation->arg_data(i), 0, computation->arg_size(i)); } } @@ -66,7 +64,7 @@ TEST(TEST_NAME, NoCrash) { CPP_CLASS computation; computation.set_thread_pool(&device); - zero_buffers(computation.args(), CPP_CLASS::ArgSizes(), CPP_CLASS::kNumArgs); + zero_buffers(&computation); EXPECT_TRUE(computation.Run()); } @@ -80,7 +78,7 @@ void BM_NAME(int iters) { CPP_CLASS computation; computation.set_thread_pool(&device); - zero_buffers(computation.args(), CPP_CLASS::ArgSizes(), CPP_CLASS::kNumArgs); + zero_buffers(&computation); testing::StartTiming(); while (--iters) { diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index fee46280e9a0e7ba2cf7c3ed46469ae8cc0841d4..0c0c676ece78565e03578d3e33633c7e23b77669 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -44,8 +44,8 @@ using ::testing::IsSupersetOf; TEST(TFCompileTest, Add) { AddComp add; - EXPECT_EQ(add.arg0_data(), add.args()[0]); - EXPECT_EQ(add.arg1_data(), add.args()[1]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); + EXPECT_EQ(add.arg1_data(), add.arg_data(1)); add.arg0() = 1; add.arg1() = 2; @@ -67,10 +67,10 @@ TEST(TFCompileTest, Add) { EXPECT_EQ(add_const.error_msg(), ""); EXPECT_EQ(add_const.arg0(), 123); EXPECT_EQ(add_const.arg0_data()[0], 123); - EXPECT_EQ(add_const.arg0_data(), add.args()[0]); + EXPECT_EQ(add_const.arg0_data(), add.arg_data(0)); EXPECT_EQ(add_const.arg1(), 456); EXPECT_EQ(add_const.arg1_data()[0], 456); - EXPECT_EQ(add_const.arg1_data(), add.args()[1]); + EXPECT_EQ(add_const.arg1_data(), add.arg_data(1)); EXPECT_EQ(add_const.result0(), 579); EXPECT_EQ(add_const.result0_data()[0], 579); EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); @@ -85,8 +85,8 @@ TEST(TFCompileTest, Add_SetArg) { int32 arg_y = 32; add.set_arg0_data(&arg_x); add.set_arg1_data(&arg_y); - EXPECT_EQ(add.arg0_data(), add.args()[0]); - EXPECT_EQ(add.arg1_data(), add.args()[1]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); + EXPECT_EQ(add.arg1_data(), add.arg_data(1)); EXPECT_TRUE(add.Run()); EXPECT_EQ(add.error_msg(), ""); @@ -97,7 +97,7 @@ TEST(TFCompileTest, Add_SetArg) { TEST(TFCompileTest, AddWithCkpt) { AddWithCkptComp add; - EXPECT_EQ(add.arg0_data(), add.args()[0]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); add.arg0() = 1; EXPECT_TRUE(add.Run()); @@ -117,7 +117,7 @@ TEST(TFCompileTest, AddWithCkpt) { EXPECT_EQ(add_const.error_msg(), ""); EXPECT_EQ(add_const.arg0(), 111); EXPECT_EQ(add_const.arg0_data()[0], 111); - EXPECT_EQ(add_const.arg0_data(), add_const.args()[0]); + EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0)); EXPECT_EQ(add_const.result0(), 153); EXPECT_EQ(add_const.result0_data()[0], 153); EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); @@ -125,7 +125,7 @@ TEST(TFCompileTest, AddWithCkpt) { TEST(TFCompileTest, AddWithCkptSaver) { AddWithCkptSaverComp add; - EXPECT_EQ(add.arg0_data(), add.args()[0]); + EXPECT_EQ(add.arg0_data(), add.arg_data(0)); add.arg0() = 1; EXPECT_TRUE(add.Run()); @@ -145,7 +145,7 @@ TEST(TFCompileTest, AddWithCkptSaver) { EXPECT_EQ(add_const.error_msg(), ""); EXPECT_EQ(add_const.arg0(), 111); EXPECT_EQ(add_const.arg0_data()[0], 111); - EXPECT_EQ(add_const.arg0_data(), add_const.args()[0]); + EXPECT_EQ(add_const.arg0_data(), add_const.arg_data(0)); EXPECT_EQ(add_const.result0(), 153); EXPECT_EQ(add_const.result0_data()[0], 153); EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); @@ -153,9 +153,9 @@ TEST(TFCompileTest, AddWithCkptSaver) { TEST(TFCompileTest, Cond) { CondComp cond; - EXPECT_EQ(cond.arg0_data(), cond.args()[0]); - EXPECT_EQ(cond.arg1_data(), cond.args()[1]); - EXPECT_EQ(cond.arg2_data(), cond.args()[2]); + EXPECT_EQ(cond.arg0_data(), cond.arg_data(0)); + EXPECT_EQ(cond.arg1_data(), cond.arg_data(1)); + EXPECT_EQ(cond.arg2_data(), cond.arg_data(2)); cond.arg1() = 10; cond.arg2() = 20; { @@ -178,8 +178,8 @@ TEST(TFCompileTest, Cond) { TEST(TFCompileTest, Gather) { GatherComp gather; - EXPECT_EQ(gather.arg0_data(), gather.args()[0]); - EXPECT_EQ(gather.arg1_data(), gather.args()[1]); + EXPECT_EQ(gather.arg0_data(), gather.arg_data(0)); + EXPECT_EQ(gather.arg1_data(), gather.arg_data(1)); // Successful gather. { @@ -202,12 +202,12 @@ TEST(TFCompileTest, Gather) { EXPECT_EQ(gather_const.arg0(i), params[i]); EXPECT_EQ(gather_const.arg0_data()[i], params[i]); } - EXPECT_EQ(gather_const.arg0_data(), gather_const.args()[0]); + EXPECT_EQ(gather_const.arg0_data(), gather_const.arg_data(0)); for (int i = 0; i < 2; ++i) { EXPECT_EQ(gather_const.arg1(i), indices[i]); EXPECT_EQ(gather_const.arg1_data()[i], indices[i]); } - EXPECT_EQ(gather_const.arg1_data(), gather_const.args()[1]); + EXPECT_EQ(gather_const.arg1_data(), gather_const.arg_data(1)); for (int i = 0; i < 2; ++i) { EXPECT_EQ(gather_const.result0(i), results[i]); EXPECT_EQ(gather_const.result0_data()[i], results[i]); @@ -222,8 +222,8 @@ TEST(TFCompileTest, MatMul2) { foo::bar::MatMulComp matmul; matmul.set_thread_pool(&device); - EXPECT_EQ(matmul.arg0_data(), matmul.args()[0]); - EXPECT_EQ(matmul.arg1_data(), matmul.args()[1]); + EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0)); + EXPECT_EQ(matmul.arg1_data(), matmul.arg_data(1)); // Test using the argN() methods. { @@ -271,12 +271,12 @@ TEST(TFCompileTest, MatMul2) { EXPECT_EQ(matmul_const.arg0(i / 3, i % 3), args[i]); EXPECT_EQ(matmul_const.arg0_data()[i], args[i]); } - EXPECT_EQ(matmul_const.arg0_data(), matmul.args()[0]); + EXPECT_EQ(matmul_const.arg0_data(), matmul.arg_data(0)); for (int i = 0; i < 6; ++i) { EXPECT_EQ(matmul_const.arg1(i / 2, i % 2), args[i + 6]); EXPECT_EQ(matmul_const.arg1_data()[i], args[i + 6]); } - EXPECT_EQ(matmul_const.arg1_data(), matmul.args()[1]); + EXPECT_EQ(matmul_const.arg1_data(), matmul.arg_data(1)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(matmul_const.result0(i / 2, i % 2), results[i]); EXPECT_EQ(matmul_const.result0_data()[i], results[i]); @@ -300,8 +300,8 @@ TEST(TFCompileTest, MatMul2_SetArg) { float arg1[3][2] = {{7, 8}, {9, 10}, {11, 12}}; matmul.set_arg0_data(&arg0); matmul.set_arg1_data(&arg1); - EXPECT_EQ(matmul.arg0_data(), matmul.args()[0]); - EXPECT_EQ(matmul.arg1_data(), matmul.args()[1]); + EXPECT_EQ(matmul.arg0_data(), matmul.arg_data(0)); + EXPECT_EQ(matmul.arg1_data(), matmul.arg_data(1)); EXPECT_TRUE(matmul.Run()); EXPECT_EQ(matmul.error_msg(), ""); @@ -319,8 +319,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { MatMulAndAddComp muladd; muladd.set_thread_pool(&device); - EXPECT_EQ(muladd.arg0_data(), muladd.args()[0]); - EXPECT_EQ(muladd.arg1_data(), muladd.args()[1]); + EXPECT_EQ(muladd.arg0_data(), muladd.arg_data(0)); + EXPECT_EQ(muladd.arg1_data(), muladd.arg_data(1)); // Test methods with positional args and results. { @@ -346,12 +346,12 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd_const.arg0(i / 2, i % 2), args[i]); EXPECT_EQ(muladd_const.arg0_data()[i], args[i]); } - EXPECT_EQ(muladd_const.arg0_data(), muladd.args()[0]); + EXPECT_EQ(muladd_const.arg0_data(), muladd.arg_data(0)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.arg1(i / 2, i % 2), args[i + 4]); EXPECT_EQ(muladd_const.arg1_data()[i], args[i + 4]); } - EXPECT_EQ(muladd_const.arg1_data(), muladd.args()[1]); + EXPECT_EQ(muladd_const.arg1_data(), muladd.arg_data(1)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.result0(i / 2, i % 2), results0[i]); EXPECT_EQ(muladd_const.result0_data()[i], results0[i]); @@ -387,12 +387,12 @@ TEST(TFCompileTest, MatMulAndAdd1) { EXPECT_EQ(muladd_const.arg_x(i / 2, i % 2), args[i]); EXPECT_EQ(muladd_const.arg_x_data()[i], args[i]); } - EXPECT_EQ(muladd_const.arg_x_data(), muladd.args()[0]); + EXPECT_EQ(muladd_const.arg_x_data(), muladd.arg_data(0)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.arg_y(i / 2, i % 2), args[i + 4]); EXPECT_EQ(muladd_const.arg_y_data()[i], args[i + 4]); } - EXPECT_EQ(muladd_const.arg_y_data(), muladd.args()[1]); + EXPECT_EQ(muladd_const.arg_y_data(), muladd.arg_data(1)); for (int i = 0; i < 4; ++i) { EXPECT_EQ(muladd_const.result_x_y_prod(i / 2, i % 2), results0[i]); EXPECT_EQ(muladd_const.result_x_y_prod_data()[i], results0[i]); @@ -407,8 +407,8 @@ TEST(TFCompileTest, MatMulAndAdd1) { TEST(TFCompileTest, Function) { // The function is equivalent to an addition FunctionComp add_fn; - EXPECT_EQ(add_fn.arg0_data(), add_fn.args()[0]); - EXPECT_EQ(add_fn.arg1_data(), add_fn.args()[1]); + EXPECT_EQ(add_fn.arg0_data(), add_fn.arg_data(0)); + EXPECT_EQ(add_fn.arg1_data(), add_fn.arg_data(1)); add_fn.arg0() = 1; add_fn.arg1() = 2; @@ -451,8 +451,8 @@ TEST(TFCompileTest, AssertEqAndReturnDiff) { // Assert is converted into a no-op in XLA, so there is no failure even if the // two args are different. AssertComp assert; - EXPECT_EQ(assert.arg0_data(), assert.args()[0]); - EXPECT_EQ(assert.arg1_data(), assert.args()[1]); + EXPECT_EQ(assert.arg0_data(), assert.arg_data(0)); + EXPECT_EQ(assert.arg1_data(), assert.arg_data(1)); assert.arg0() = 2; assert.arg1() = 1; diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 5c57fee326ca743dcb8aaae354d261ed4d7f44be..326f73b975aec3a7a6bc7cdc9a92f540ad545ad6 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -16,339 +16,365 @@ tf_library( ) """ -load("//tensorflow:tensorflow.bzl", - "if_android", "tf_cc_test", "tf_copts") - -def tf_library(name, graph, config, - freeze_checkpoint=None, freeze_saver=None, - cpp_class=None, gen_test=True, gen_benchmark=True, - visibility=None, testonly=None, - tfcompile_flags=None, - tfcompile_tool="//tensorflow/compiler/aot:tfcompile", - include_standard_runtime_deps=True, - enable_xla_hlo_profiling=False, deps=None, tags=None): - """Runs tfcompile to compile a TensorFlow graph into executable code. - - Given an invocation of tf_library(name="foo", ...), generates the following - build targets: - foo: A cc_library containing the generated header and computation. - foo_test: A cc_test with simple tests and benchmarks. Only created if - gen_test=True. - foo_benchmark: A cc_binary that runs a minimal-dependency benchmark, useful - for mobile devices or other platforms that can't compile the - full test libraries. Only created if gen_benchmark=True. - - Args: - name: The name of the build rule. - graph: The TensorFlow GraphDef to compile. If the file ends in '.pbtxt' it - is expected to be in the human-readable proto text format, otherwise it is - expected to be in the proto binary format. - config: File containing tensorflow.tf2xla.Config proto. If the file ends - in '.pbtxt' it is expected to be in the human-readable proto text format, - otherwise it is expected to be in the proto binary format. - freeze_checkpoint: If provided, run freeze_graph with this checkpoint to - convert variables into constants. - freeze_saver: If provided, run freeze_graph with this saver, in SaverDef - binary form, to convert variables into constants. - cpp_class: The name of the generated C++ class, wrapping the generated - function. The syntax of this flag is - [[::],...]. This mirrors the C++ syntax - for referring to a class, where multiple namespaces may precede the class - name, separated by double-colons. The class will be generated in the - given namespace(s), or if no namespaces are given, within the global - namespace. - gen_test: If True, also generate a cc_test rule that builds a simple - test and benchmark. - gen_benchmark: If True, also generate a binary with a simple benchmark. - Unlike the output of gen_test, this benchmark can be run on android. - visibility: Bazel build visibility. - testonly: Bazel testonly attribute. - tfcompile_flags: Extra flags to pass to tfcompile to control compilation. - tfcompile_tool: The tfcompile binary. A non-default can be passed to - use a tfcompile built with extra dependencies. - include_standard_runtime_deps: If True, the standard list of kernel/runtime - deps is added to deps. If False, deps must contain the full set of deps - needed by the generated library. - enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated program, - and emit metadata that lets us pretty-print the gathered profile counters. - deps: a list of deps to include on the build rules for the generated - library, added to the standard deps if standard_runtime_deps is True. - tags: tags to apply to subsidiary build rules. - - The output header is called .h. - """ - if not cpp_class: - fail("cpp_class must be specified") - - tfcompile_graph = graph - if freeze_checkpoint or freeze_saver: - if not freeze_checkpoint: - fail("freeze_checkpoint must be specified when freeze_saver is specified") +load( + "//tensorflow:tensorflow.bzl", + "if_android", + "tf_cc_test", + "tf_copts", +) - freeze_name = "freeze_" + name - freeze_file = freeze_name + ".pb" +def tf_library( + name, + graph, + config, + freeze_checkpoint = None, + freeze_saver = None, + cpp_class = None, + gen_test = True, + gen_benchmark = True, + visibility = None, + testonly = None, + tfcompile_flags = None, + tfcompile_tool = "//tensorflow/compiler/aot:tfcompile", + include_standard_runtime_deps = True, + enable_xla_hlo_profiling = False, + deps = None, + tags = None): + """Runs tfcompile to compile a TensorFlow graph into executable code. - # First run tfcompile to generate the list of out_nodes. - out_nodes_file = "out_nodes_" + freeze_name - native.genrule( - name=("gen_" + out_nodes_file), - srcs=[config], - outs=[out_nodes_file], - cmd=("$(location " + tfcompile_tool + ")" + - " --config=$(location " + config + ")" + - " --dump_fetch_nodes > $@"), - tools=[tfcompile_tool], - # Run tfcompile on the build host, rather than forge, since it's - # typically way faster on the local machine. - local=1, - tags=tags, - ) + Given an invocation of tf_library(name="foo", ...), generates the following + build targets: + foo: A cc_library containing the generated header and + computation. + foo_test: A cc_test with simple tests and benchmarks. Only created if + gen_test=True. + foo_benchmark: A cc_binary that runs a minimal-dependency benchmark, + useful for mobile devices or other platforms that can't + compile the full test libraries. Only created if + gen_benchmark=True. + The output header is called .h. - # Now run freeze_graph to convert variables into constants. - freeze_args = (" --input_graph=$(location " + graph + ")" + - " --checkpoint_version=1" + - " --input_binary=" + str(not graph.endswith(".pbtxt")) + - " --input_checkpoint=$(location " + freeze_checkpoint + ")" + - " --output_graph=$(location " + freeze_file + ")" + - " --output_node_names=$$(<$(location " + out_nodes_file + - "))") - freeze_saver_srcs = [] - if freeze_saver: - freeze_args += " --input_saver=$(location " + freeze_saver + ")" - freeze_saver_srcs += [freeze_saver] - native.genrule( - name=freeze_name, - srcs=[ - graph, - freeze_checkpoint, - out_nodes_file, - ] + freeze_saver_srcs, - outs=[freeze_file], - cmd=("$(location //tensorflow/python/tools:freeze_graph)" + - freeze_args), - tools=["//tensorflow/python/tools:freeze_graph"], - tags=tags, - ) - tfcompile_graph = freeze_file + Args: + name: The name of the build rule. + graph: The TensorFlow GraphDef to compile. If the file ends in '.pbtxt' + it is expected to be in the human-readable proto text format, otherwise + it is expected to be in the proto binary format. + config: File containing tensorflow.tf2xla.Config proto. If the file ends + in '.pbtxt' it is expected to be in the human-readable proto text + format, otherwise it is expected to be in the proto binary format. + freeze_checkpoint: If provided, run freeze_graph with this checkpoint to + convert variables into constants. + freeze_saver: If provided, run freeze_graph with this saver, in SaverDef + binary form, to convert variables into constants. + cpp_class: The name of the generated C++ class, wrapping the generated + function. The syntax of this flag is + [[::],...]. This mirrors the C++ syntax + for referring to a class, where multiple namespaces may precede the + class name, separated by double-colons. The class will be generated in + the given namespace(s), or if no namespaces are given, within the global + namespace. + gen_test: If True, also generate a cc_test rule that builds a simple + test and benchmark. + gen_benchmark: If True, also generate a binary with a simple benchmark. + Unlike the output of gen_test, this benchmark can be run on android. + visibility: Bazel build visibility. + testonly: Bazel testonly attribute. + tfcompile_flags: Extra flags to pass to tfcompile to control compilation. + tfcompile_tool: The tfcompile binary. A non-default can be passed to + use a tfcompile built with extra dependencies. + include_standard_runtime_deps: If True, the standard list of + kernel/runtime deps is added to deps. If False, deps must contain the + full set of deps needed by the generated library. + enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated + program, and emit metadata that lets us pretty-print the gathered + profile counters. + deps: a list of deps to include on the build rules for the generated + library, added to the standard deps if standard_runtime_deps is True. + tags: tags to apply to subsidiary build rules. + """ + if not cpp_class: + fail("cpp_class must be specified") - # Rule that runs tfcompile to produce the header and object file. - header_file = name + ".h" - metadata_object_file = name + "_tfcompile_metadata.o" - function_object_file = name + "_tfcompile_function.o" - ep = ("__" + native.package_name() + "__" + name).replace("/", "_") - if type(tfcompile_flags) == type(""): - flags = tfcompile_flags - else: - flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])]) - if enable_xla_hlo_profiling: - profiling_flag = "--xla_hlo_profile" - else: - profiling_flag = "" - native.genrule( - name=("gen_" + name), - srcs=[ - tfcompile_graph, - config, - ], - outs=[ - header_file, - metadata_object_file, - function_object_file, - ], - cmd=("$(location " + tfcompile_tool + ")" + - " --graph=$(location " + tfcompile_graph + ")" + - " --config=$(location " + config + ")" + - " --entry_point=" + ep + - " --cpp_class=" + cpp_class + - " --target_triple=" + target_llvm_triple() + - " --out_header=$(@D)/" + header_file + - " --out_metadata_object=$(@D)/" + metadata_object_file + - " --out_function_object=$(@D)/" + function_object_file + - " " + flags + " " + profiling_flag), - tools=[tfcompile_tool], - visibility=visibility, - testonly=testonly, - # Run tfcompile on the build host since it's typically faster on the local - # machine. - # - # Note that setting the local=1 attribute on a *test target* causes the - # test infrastructure to skip that test. However this is a genrule, not a - # test target, and runs with --genrule_strategy=forced_forge, meaning the - # local=1 attribute is ignored, and the genrule is still run. - # - # https://www.bazel.io/versions/master/docs/be/general.html#genrule - local=1, - tags=tags, - ) + tfcompile_graph = graph + if freeze_checkpoint or freeze_saver: + if not freeze_checkpoint: + fail("freeze_checkpoint must be specified when freeze_saver is " + + "specified") - # Rule that runs tfcompile to produce the SessionModule proto, useful for - # debugging. TODO(b/64813587): Once the SessionModule proto is - # deterministic, move this into the main rule above. - session_module_pb = name + "_session_module.pb" - native.genrule( - name=(name + "_session_module"), - srcs=[ - tfcompile_graph, - config, - ], - outs=[ - session_module_pb, - ], - cmd=("$(location " + tfcompile_tool + ")" + - " --graph=$(location " + tfcompile_graph + ")" + - " --config=$(location " + config + ")" + - " --entry_point=" + ep + - " --cpp_class=" + cpp_class + - " --target_triple=" + target_llvm_triple() + - " --out_session_module=$(@D)/" + session_module_pb + - " " + flags), - tools=[tfcompile_tool], - visibility=visibility, - testonly=testonly, - local=1, - tags=tags, - ) + freeze_name = "freeze_" + name + freeze_file = freeze_name + ".pb" - # The cc_library rule packaging up the header and object file, and needed - # kernel implementations. - need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1) - native.cc_library( - name=name, - srcs=[function_object_file, metadata_object_file], - hdrs=[header_file], - visibility=visibility, - testonly=testonly, - deps = [ - # These deps are required by all tf_library targets even if - # include_standard_runtime_deps is False. Without them, the - # generated code will fail to compile. - "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", - "//tensorflow/core:framework_lite", - ] + (need_xla_data_proto and [ - # If we're generating the program shape, we must depend on the proto. - "//tensorflow/compiler/xla:xla_data_proto", - ] or []) + (enable_xla_hlo_profiling and [ - "//tensorflow/compiler/xla/service:hlo_profile_printer_data" - ] or []) + (include_standard_runtime_deps and [ - # TODO(cwhipkey): only depend on kernel code that the model actually needed. - "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", - "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", - "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", - "//tensorflow/compiler/xla/service/cpu:runtime_matmul", - "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", - "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", - "//third_party/eigen3", - ] or []) + (deps or []), - tags=tags, - ) + # First run tfcompile to generate the list of out_nodes. + out_nodes_file = "out_nodes_" + freeze_name + native.genrule( + name = ("gen_" + out_nodes_file), + srcs = [config], + outs = [out_nodes_file], + cmd = ("$(location " + tfcompile_tool + ")" + + " --config=$(location " + config + ")" + + " --dump_fetch_nodes > $@"), + tools = [tfcompile_tool], + # Run tfcompile on the build host, rather than forge, since it's + # typically way faster on the local machine. + local = 1, + tags = tags, + ) - # Variables used for gen_test and gen_benchmark. - no_ns_name = "" - cpp_class_split = cpp_class.rsplit("::", maxsplit=2) - if len(cpp_class_split) == 1: - no_ns_name = cpp_class_split[0] - else: - no_ns_name = cpp_class_split[1] - sed_replace = ( - "-e \"s|{{TFCOMPILE_HEADER}}|$(location " + header_file + ")|g\" " + - "-e \"s|{{TFCOMPILE_CPP_CLASS}}|" + cpp_class + "|g\" " + - "-e \"s|{{TFCOMPILE_NAME}}|" + no_ns_name + "|g\" ") + # Now run freeze_graph to convert variables into constants. + freeze_args = ( + " --input_graph=$(location " + graph + ")" + + " --checkpoint_version=1" + + " --input_binary=" + str(not graph.endswith(".pbtxt")) + + " --input_checkpoint=$(location " + freeze_checkpoint + ")" + + " --output_graph=$(location " + freeze_file + ")" + + " --output_node_names=$$(<$(location " + out_nodes_file + + "))" + ) + freeze_saver_srcs = [] + if freeze_saver: + freeze_args += " --input_saver=$(location " + freeze_saver + ")" + freeze_saver_srcs += [freeze_saver] + native.genrule( + name = freeze_name, + srcs = [ + graph, + freeze_checkpoint, + out_nodes_file, + ] + freeze_saver_srcs, + outs = [freeze_file], + cmd = ("$(location " + + "//tensorflow/python/tools:freeze_graph)" + + freeze_args), + tools = ["//tensorflow/python/tools:freeze_graph"], + tags = tags, + ) + tfcompile_graph = freeze_file - if gen_test: - test_name = name + "_test" - test_file = test_name + ".cc" - # Rule to rewrite test.cc to produce the test_file. + # Rule that runs tfcompile to produce the header and object file. + header_file = name + ".h" + metadata_object_file = name + "_tfcompile_metadata.o" + function_object_file = name + "_tfcompile_function.o" + ep = ("__" + native.package_name() + "__" + name).replace("/", "_") + if type(tfcompile_flags) == type(""): + flags = tfcompile_flags + else: + flags = " ".join([ + "'" + arg.replace("'", "'\\''") + "'" + for arg in (tfcompile_flags or []) + ]) + if enable_xla_hlo_profiling: + profiling_flag = "--xla_hlo_profile" + else: + profiling_flag = "" native.genrule( - name=("gen_" + test_name), - testonly=1, - srcs=[ - "//tensorflow/compiler/aot:test.cc", + name = ("gen_" + name), + srcs = [ + tfcompile_graph, + config, + ], + outs = [ header_file, + metadata_object_file, + function_object_file, ], - outs=[test_file], - cmd=("sed " + sed_replace + - " $(location //tensorflow/compiler/aot:test.cc) " + - "> $(OUTS)"), - tags=tags, - ) - - # The cc_test rule for the generated code. To ensure that this works - # reliably across build configurations, we must use tf_cc_test instead of - # native.cc_test. This is related to how we build - # //tensorflow/core:lib -- see the note in tensorflow/core/BUILD - # for more details. - tf_cc_test( - name=test_name, - srcs=[test_file], - deps=[ - ":" + name, - "//tensorflow/compiler/aot:runtime", - "//tensorflow/compiler/aot:tf_library_test_main", - "//tensorflow/compiler/xla:executable_run_options", - "//third_party/eigen3", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], - tags=tags, + cmd = ("$(location " + tfcompile_tool + ")" + + " --graph=$(location " + tfcompile_graph + ")" + + " --config=$(location " + config + ")" + + " --entry_point=" + ep + + " --cpp_class=" + cpp_class + + " --target_triple=" + target_llvm_triple() + + " --out_header=$(@D)/" + header_file + + " --out_metadata_object=$(@D)/" + metadata_object_file + + " --out_function_object=$(@D)/" + function_object_file + + " " + flags + " " + profiling_flag), + tools = [tfcompile_tool], + visibility = visibility, + testonly = testonly, + # Run tfcompile on the build host since it's typically faster on the + # local machine. + # + # Note that setting the local=1 attribute on a *test target* causes the + # test infrastructure to skip that test. However this is a genrule, not + # a test target, and runs with --genrule_strategy=forced_forge, meaning + # the local=1 attribute is ignored, and the genrule is still run. + # + # https://www.bazel.io/versions/master/docs/be/general.html#genrule + local = 1, + tags = tags, ) - if gen_benchmark: - benchmark_name = name + "_benchmark" - benchmark_file = benchmark_name + ".cc" - benchmark_main = ("//tensorflow/compiler/aot:" + - "benchmark_main.template") - - # Rule to rewrite benchmark.cc to produce the benchmark_file. + # Rule that runs tfcompile to produce the SessionModule proto, useful for + # debugging. TODO(b/64813587): Once the SessionModule proto is + # deterministic, move this into the main rule above. + session_module_pb = name + "_session_module.pb" native.genrule( - name=("gen_" + benchmark_name), - srcs=[ - benchmark_main, - header_file, + name = (name + "_session_module"), + srcs = [ + tfcompile_graph, + config, ], + outs = [ + session_module_pb, + ], + cmd = ("$(location " + tfcompile_tool + ")" + + " --graph=$(location " + tfcompile_graph + ")" + + " --config=$(location " + config + ")" + + " --entry_point=" + ep + + " --cpp_class=" + cpp_class + + " --target_triple=" + target_llvm_triple() + + " --out_session_module=$(@D)/" + session_module_pb + + " " + flags), + tools = [tfcompile_tool], + visibility = visibility, testonly = testonly, - outs=[benchmark_file], - cmd=("sed " + sed_replace + - " $(location " + benchmark_main + ") " + - "> $(OUTS)"), - tags=tags, + local = 1, + tags = tags, ) - # The cc_benchmark rule for the generated code. This does not need the - # tf_cc_binary since we (by deliberate design) do not depend on - # //tensorflow/core:lib. - # - # Note: to get smaller size on android for comparison, compile with: - # --copt=-fvisibility=hidden - # --copt=-D_LIBCPP_TYPE_VIS=_LIBCPP_HIDDEN - # --copt=-D_LIBCPP_EXCEPTION_ABI=_LIBCPP_HIDDEN - native.cc_binary( - name=benchmark_name, - srcs=[benchmark_file], + # The cc_library rule packaging up the header and object file, and needed + # kernel implementations. + need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1) + native.cc_library( + name = name, + srcs = [function_object_file, metadata_object_file], + hdrs = [header_file], + visibility = visibility, testonly = testonly, - copts = tf_copts(), - linkopts = if_android(["-pie", "-s"]), - deps=[ - ":" + name, - "//tensorflow/compiler/aot:benchmark", - "//tensorflow/compiler/aot:runtime", - "//tensorflow/compiler/xla:executable_run_options", + deps = [ + # These deps are required by all tf_library targets even if + # include_standard_runtime_deps is False. Without them, the + # generated code will fail to compile. + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/core:framework_lite", + ] + (need_xla_data_proto and [ + # If we're generating the program shape, we must depend on the + # proto. + "//tensorflow/compiler/xla:xla_data_proto", + ] or []) + (enable_xla_hlo_profiling and [ + "//tensorflow/compiler/xla/service:hlo_profile_printer_data", + ] or []) + (include_standard_runtime_deps and [ + # TODO(cwhipkey): only depend on kernel code that the model actually + # needed. + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", + "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_2d", + "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_matmul", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d", + "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//third_party/eigen3", - ] + if_android([ - "//tensorflow/compiler/aot:benchmark_extra_android", - ]), - tags=tags, + ] or []) + (deps or []), + tags = tags, + ) + + # Variables used for gen_test and gen_benchmark. + cpp_class_split = cpp_class.rsplit("::", maxsplit = 2) + if len(cpp_class_split) == 1: + no_ns_name = cpp_class_split[0] + else: + no_ns_name = cpp_class_split[1] + sed_replace = ( + "-e \"s|{{TFCOMPILE_HEADER}}|$(location " + header_file + ")|g\" " + + "-e \"s|{{TFCOMPILE_CPP_CLASS}}|" + cpp_class + "|g\" " + + "-e \"s|{{TFCOMPILE_NAME}}|" + no_ns_name + "|g\" " ) + if gen_test: + test_name = name + "_test" + test_file = test_name + ".cc" + + # Rule to rewrite test.cc to produce the test_file. + native.genrule( + name = ("gen_" + test_name), + testonly = 1, + srcs = [ + "//tensorflow/compiler/aot:test.cc", + header_file, + ], + outs = [test_file], + cmd = ( + "sed " + sed_replace + + " $(location //tensorflow/compiler/aot:test.cc) " + + "> $(OUTS)" + ), + tags = tags, + ) + + # The cc_test rule for the generated code. To ensure that this works + # reliably across build configurations, we must use tf_cc_test instead + # of native.cc_test. This is related to how we build + # //tensorflow/core:lib -- see the note in + # tensorflow/core/BUILD for more details. + tf_cc_test( + name = test_name, + srcs = [test_file], + deps = [ + ":" + name, + "//tensorflow/compiler/aot:tf_library_test_main", + "//tensorflow/compiler/xla:executable_run_options", + "//third_party/eigen3", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], + tags = tags, + ) + + if gen_benchmark: + benchmark_name = name + "_benchmark" + benchmark_file = benchmark_name + ".cc" + benchmark_main = ("//tensorflow/compiler/aot:" + + "benchmark_main.template") + + # Rule to rewrite benchmark.cc to produce the benchmark_file. + native.genrule( + name = ("gen_" + benchmark_name), + srcs = [ + benchmark_main, + header_file, + ], + testonly = testonly, + outs = [benchmark_file], + cmd = ("sed " + sed_replace + + " $(location " + benchmark_main + ") " + + "> $(OUTS)"), + tags = tags, + ) + + # The cc_benchmark rule for the generated code. This does not need the + # tf_cc_binary since we (by deliberate design) do not depend on + # //tensorflow/core:lib. + # + # Note: to get smaller size on android for comparison, compile with: + # --copt=-fvisibility=hidden + # --copt=-D_LIBCPP_TYPE_VIS=_LIBCPP_HIDDEN + # --copt=-D_LIBCPP_EXCEPTION_ABI=_LIBCPP_HIDDEN + native.cc_binary( + name = benchmark_name, + srcs = [benchmark_file], + testonly = testonly, + copts = tf_copts(), + linkopts = if_android(["-pie", "-s"]), + deps = [ + ":" + name, + "//tensorflow/compiler/aot:benchmark", + "//tensorflow/compiler/xla:executable_run_options", + "//third_party/eigen3", + ] + if_android([ + "//tensorflow/compiler/aot:benchmark_extra_android", + ]), + tags = tags, + ) + def target_llvm_triple(): - """Returns the target LLVM triple to be used for compiling the target.""" - # TODO(toddw): Add target_triple for other targets. For details see: - # http://llvm.org/docs/doxygen/html/Triple_8h_source.html - return select({ - "//tensorflow:android_armeabi": "armv5-none-android", - "//tensorflow:android_arm": "armv7-none-android", - "//tensorflow:android_arm64": "aarch64-none-android", - "//tensorflow:android_x86": "i686-none-android", - "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", - "//tensorflow:darwin": "x86_64-none-darwin", - "//conditions:default": "x86_64-pc-linux", - }) + """Returns the target LLVM triple to be used for compiling the target.""" + + # TODO(toddw): Add target_triple for other targets. For details see: + # http://llvm.org/docs/doxygen/html/Triple_8h_source.html + return select({ + "//tensorflow:android_armeabi": "armv5-none-android", + "//tensorflow:android_arm": "armv7-none-android", + "//tensorflow:android_arm64": "aarch64-none-android", + "//tensorflow:android_x86": "i686-none-android", + "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu", + "//tensorflow:darwin": "x86_64-none-darwin", + "//conditions:default": "x86_64-pc-linux", + }) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index e34347b9d4e31be1b37b7ef1cb30911dd290ea7b..9e6d7fa0b11879046a8b37cba3cb9635b52e191c 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -160,6 +160,7 @@ cc_library( "//tensorflow/compiler/jit/ops:xla_ops", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:dump_graph", + "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:util", @@ -178,6 +179,7 @@ cc_library( "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:control_flow_ops", "//tensorflow/core/kernels:fifo_queue", + "//tensorflow/core/kernels:function_ops", "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:no_op", @@ -186,6 +188,9 @@ cc_library( "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:shape_ops", "//tensorflow/core/kernels:variable_ops", + "//tensorflow/core/kernels/data:generator_dataset_op", + "//tensorflow/core/kernels/data:iterator_ops", + "//tensorflow/core/kernels/data:prefetch_dataset_op", ], ) @@ -306,14 +311,17 @@ cc_library( srcs = [ "build_xla_launch_ops_pass.cc", "deadness_analysis.cc", + "deadness_analysis_internal.h", "encapsulate_subgraphs_pass.cc", "mark_for_compilation_pass.cc", + "mark_for_compilation_pass_test_helper.cc", ], hdrs = [ "build_xla_launch_ops_pass.h", "deadness_analysis.h", "encapsulate_subgraphs_pass.h", "mark_for_compilation_pass.h", + "mark_for_compilation_pass_test_helper.h", ], deps = [ ":common", @@ -378,10 +386,38 @@ tf_cc_test( ) tf_cc_test( - name = "compilation_passes_test", + name = "deadness_analysis_test", size = "small", srcs = [ + "deadness_analysis_internal.h", "deadness_analysis_test.cc", + ], + deps = [ + ":common", + ":compilation_passes", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:sendrecv_ops", + "//tensorflow/compiler/jit/kernels:xla_launch_op", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:framework_internal", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + +tf_cc_test( + name = "compilation_passes_test", + size = "small", + srcs = [ "encapsulate_subgraphs_pass_test.cc", "mark_for_compilation_pass_test.cc", ], diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc index d81e5fe9008975c126bcd8e0ea7cef19f1eb1bf3..62007e6115d3fb81def844fcfa462094e223f565 100644 --- a/tensorflow/compiler/jit/deadness_analysis.cc +++ b/tensorflow/compiler/jit/deadness_analysis.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/jit/deadness_analysis.h" +#include "tensorflow/compiler/jit/deadness_analysis_internal.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/tensor_id.h" #include "tensorflow/core/lib/gtl/flatset.h" @@ -45,6 +46,7 @@ class Predicate { virtual string ToString() const = 0; int64 hash() const { return hash_; } + virtual gtl::ArraySlice GetOperands() const = 0; virtual Kind kind() const = 0; virtual ~Predicate() {} @@ -89,7 +91,8 @@ class AndPredicate : public Predicate { Kind kind() const override { return Kind::kAnd; } - const gtl::ArraySlice operands() const { return operands_; } + gtl::ArraySlice GetOperands() const override { return operands_; } + gtl::ArraySlice operands() const { return operands_; } private: std::vector operands_; @@ -116,7 +119,8 @@ class OrPredicate : public Predicate { } Kind kind() const override { return Kind::kOr; } - const gtl::ArraySlice operands() const { return operands_; } + gtl::ArraySlice GetOperands() const override { return operands_; } + gtl::ArraySlice operands() const { return operands_; } private: std::vector operands_; @@ -127,17 +131,18 @@ class NotPredicate : public Predicate { public: explicit NotPredicate(Predicate* operand) : Predicate(HashPredicateSequence(Kind::kNot, {operand})), - operand_(operand) {} + operands_({operand}) {} string ToString() const override { return strings::StrCat("~", operand()->ToString()); } Kind kind() const override { return Kind::kNot; } - Predicate* operand() const { return operand_; } + Predicate* operand() const { return operands_[0]; } + gtl::ArraySlice GetOperands() const override { return operands_; } private: - Predicate* operand_; + std::array operands_; }; // Represents an uninterpreted symbol in a logical predicate. @@ -151,8 +156,13 @@ class SymbolPredicate : public Predicate { tensor_id_(std::move(tensor_id)), must_be_true_(must_be_true) {} - string ToString() const override { return tensor_id_.ToString(); } + string ToString() const override { + return must_be_true() ? strings::StrCat("*", tensor_id_.ToString()) + : tensor_id_.ToString(); + } + Kind kind() const override { return Kind::kSymbol; } + gtl::ArraySlice GetOperands() const override { return {}; } // If `must_be_true()` is true this SymbolPredicate represents the proposition // "tensor_id() is live and evaluates to true". @@ -283,10 +293,7 @@ Predicate* PredicateFactory::MakeAndOrImpl(gtl::ArraySlice operands, if (op->kind() == pred_kind) { // "Inline" the operands of an inner And/Or into the parent And/Or. - gtl::ArraySlice operands = - is_and ? dynamic_cast(op)->operands() - : dynamic_cast(op)->operands(); - for (Predicate* subop : operands) { + for (Predicate* subop : op->GetOperands()) { if (simplified_ops_set.insert(subop).second) { simplified_ops.push_back(subop); } @@ -348,6 +355,7 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis { Status Populate(); bool HasInputsWithMismatchingDeadness(const Node& node) override; void Print() const override; + gtl::FlatMap PredicateMapAsString() const; private: enum class EdgeKind { kDataAndControl, kDataOnly, kControlOnly }; @@ -563,4 +571,24 @@ DeadnessAnalysis::~DeadnessAnalysis() {} return Status::OK(); } +gtl::FlatMap +DeadnessAnalysisImpl::PredicateMapAsString() const { + gtl::FlatMap result; + std::vector tensor_ids; + for (const auto& kv_pair : predicate_map_) { + CHECK(result.insert({kv_pair.first, kv_pair.second->ToString()}).second); + } + return result; +} + +namespace deadness_analysis_internal { +Status ComputePredicates(const Graph& graph, + PredicateMapTy* out_predicate_map) { + DeadnessAnalysisImpl impl(&graph); + TF_RETURN_IF_ERROR(impl.Populate()); + *out_predicate_map = impl.PredicateMapAsString(); + return Status::OK(); +} +} // namespace deadness_analysis_internal + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/deadness_analysis_internal.h b/tensorflow/compiler/jit/deadness_analysis_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..cdef4051108fdc5d063ab592676c7644989155bf --- /dev/null +++ b/tensorflow/compiler/jit/deadness_analysis_internal.h @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ +#define TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ + +#include "tensorflow/core/graph/tensor_id.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace tensorflow { +namespace deadness_analysis_internal { + +// Returns a map describing the predicate each Tensor was mapped to. For +// testing purposes only. +using PredicateMapTy = gtl::FlatMap; +Status ComputePredicates(const Graph& graph, PredicateMapTy* out_predicate_map); +} // namespace deadness_analysis_internal +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_DEADNESS_ANALYSIS_INTERNAL_H_ diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc index 584385cab7665dce9c7c92eab6293436ca22c9b7..6881095b51758d2e0b06c60021bc8c2860ac566e 100644 --- a/tensorflow/compiler/jit/deadness_analysis_test.cc +++ b/tensorflow/compiler/jit/deadness_analysis_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/sendrecv_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/deadness_analysis_internal.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -439,5 +440,28 @@ TEST(DeadnessAnalysisTest, RecvVsSwitch) { EXPECT_TRUE(result->HasInputsWithMismatchingDeadness(*logical_and.node())); } +TEST(DeadnessAnalysisTest, RecvVsSwitchText) { + // Demonstrates why we need the must_be_true bit on SymbolP. + Scope root = Scope::NewRootScope().ExitOnError(); + + Output recv = ops::_Recv(root.WithOpName("recv"), DT_BOOL, "tensor", "sender", + 0, "receiver"); + Output value = ops::Placeholder(root.WithOpName("value"), DT_BOOL); + ops::Switch sw(root.WithOpName("switch"), value, recv); + Output logical_and = + ops::LogicalAnd(root.WithOpName("and"), recv, sw.output_true); + + std::unique_ptr result; + TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result)); + + deadness_analysis_internal::PredicateMapTy predicate_map; + TF_ASSERT_OK(deadness_analysis_internal::ComputePredicates(*root.graph(), + &predicate_map)); + + TensorId logical_and_output_0 = {logical_and.node()->name(), + Graph::kControlSlot}; + EXPECT_EQ(predicate_map[logical_and_output_0], "(recv:0 & *recv:0)"); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index fdd71c6a588ad96301f543651c8531e6f9c3ca05..f150bf1819d407e1c6a279673a89de4307b5426b 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -1161,8 +1161,7 @@ Status Encapsulator::Subgraph::ReplaceFunctionDef( strings::StrCat("replace_encapsulate_fdef_", name), fdef); } - TF_RETURN_IF_ERROR(library->RemoveFunction(name)); - TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef)); + TF_RETURN_IF_ERROR(library->ReplaceFunction(name, fdef)); return Status::OK(); } diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 00a6f4075f9a18efc3895b033eb6d08e36088a53..8f78c110cb15f3cbc0344d102764241996b0d7de 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -16,6 +16,7 @@ cc_library( "//tensorflow/compiler/jit:xla_device", "//tensorflow/compiler/jit:xla_launch_util", "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index b313d48011b561eaab618692df49d1558c34a77c..7f4370b5b07b249bc9cf1f2ecf4086de359be68c 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" @@ -199,7 +200,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { run_options.set_stream(stream); run_options.set_allocator(xla_allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); - run_options.set_rng_seed(ctx->step_id()); + run_options.set_rng_seed(GetXLARandomSeed()); Env* env = Env::Default(); auto start_time = env->NowMicros(); @@ -209,7 +210,8 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; - launch_context.PopulateOutputs(ctx, kernel, run_result.ConsumeValueOrDie()); + OP_REQUIRES_OK(ctx, launch_context.PopulateOutputs( + ctx, kernel, run_result.ConsumeValueOrDie())); VLOG(1) << "Done"; } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 38eb6d830f4d4e889810acd0f928e93d0b22bde8..d33287fcc38337fa37bdfd2f441a9755058a54ab 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -65,6 +65,7 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { // XLA cluster so it can't implement the forward-tensor-ref semantic. Leave // such nodes out of XLA clusters. if (HasForwardedRefInput(node)) { + VLOG(2) << "Rejecting " << node.name() << ": Identity with unsafe cast."; return false; } @@ -84,14 +85,13 @@ bool IsCompilableCall(const NodeDef& call_def, bool IsCompilableWhile(const Node& while_node, const DeviceType& jit_device_type, int depth, FunctionLibraryRuntime* lib_runtime) { - VLOG(2) << "Loop marking: " << while_node.type_string(); - const NameAttrList* name_attr; NodeDef call; Status status; status = GetNodeAttr(while_node.attrs(), "cond", &name_attr); if (!status.ok()) { - VLOG(2) << "Missing 'cond' attribute on While node."; + VLOG(2) << "Rejecting While " << while_node.name() + << ": missing 'cond' attribute on While node."; return false; } const string cond_func = name_attr->name(); @@ -99,12 +99,14 @@ bool IsCompilableWhile(const Node& while_node, call.set_op(cond_func); *call.mutable_attr() = name_attr->attr(); if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) { - VLOG(2) << "Can't compile loop condition: " << cond_func; + VLOG(2) << "Rejecting While " << while_node.name() + << ": can't compile loop condition: " << cond_func; return false; } status = GetNodeAttr(while_node.attrs(), "body", &name_attr); if (!status.ok()) { - VLOG(2) << "Missing 'body' attribute on While node."; + VLOG(2) << "Rejecting While " << while_node.name() + << ": missing 'body' attribute on While node."; return false; } const string body_func = name_attr->name(); @@ -112,10 +114,10 @@ bool IsCompilableWhile(const Node& while_node, call.set_op(body_func); *call.mutable_attr() = name_attr->attr(); if (!IsCompilableCall(call, jit_device_type, depth + 1, lib_runtime)) { - VLOG(2) << "Can't compile loop body: " << body_func; + VLOG(2) << "Rejecting While " << while_node.name() + << ": can't compile loop body: " << body_func; return false; } - VLOG(2) << "Loop is compilable."; return true; } @@ -125,10 +127,9 @@ bool IsCompilableWhile(const Node& while_node, bool IsCompilableCall(const NodeDef& call_def, const DeviceType& jit_device_type, int depth, FunctionLibraryRuntime* lib_runtime) { - VLOG(2) << "Function marking: " << call_def.op(); - if (depth > kMaxRecursionDepth) { - VLOG(2) << "Function depth limit exceeded"; + VLOG(2) << "Rejecting " << call_def.op() + << ": function depth limit exceeded."; return false; } @@ -136,7 +137,8 @@ bool IsCompilableCall(const NodeDef& call_def, Status status = lib_runtime->Instantiate(call_def.op(), AttrSlice(call_def), &handle); if (!status.ok()) { - VLOG(2) << "Could not instantiate " << call_def.op() << ": " << status; + VLOG(2) << "Rejecting " << call_def.op() + << ": could not instantiate: " << status; return false; } const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle); @@ -150,7 +152,8 @@ bool IsCompilableCall(const NodeDef& call_def, // tf2xla to translate the TF graph into XLA. So we avoid this for now. // // TODO(b/36139787): Create a mechanism to set inlining hints. - VLOG(2) << "Can't compile noinline function: " << fdef.DebugString(); + VLOG(2) << "Rejecting " << call_def.op() + << ": can't compile noinline function."; return false; } @@ -164,12 +167,11 @@ bool IsCompilableCall(const NodeDef& call_def, if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, depth + 1, lib_runtime)) { - VLOG(2) << "Function marking failed: unsupported op " << node->name() - << ": " << node->def().ShortDebugString(); + VLOG(2) << "Rejecting " << call_def.op() << ": unsupported op " + << node->name() << ": " << node->def().ShortDebugString(); return false; } } - VLOG(2) << "Function is compilable: " << call_def.op(); return true; } @@ -357,24 +359,27 @@ Status FindCompilationCandidates( } std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeComparatorID()); + if (fuel >= std::numeric_limits::max() / 2) { + // The assumption is that if fuel started out as INT64_MAX, it will forever + // stay greater than INT64_MAX / 2. + VLOG(2) << "Starting fuel: infinity"; + } else { + VLOG(2) << "Starting fuel: " << fuel; + } + for (Node* node : sorted_nodes) { - VLOG(2) << "Fuel: " << fuel; if (fuel <= 0) { - VLOG(2) + VLOG(1) << "Hit fuel limit; not marking any remaining ops as clusterable."; break; } - VLOG(2) << "FindCompilationCandidates(): Processing " - << node->DebugString(); - DeviceType device_type(""); TF_RETURN_IF_ERROR( DeviceToDeviceType(node->assigned_device_name(), &device_type)); if (is_compilable_fn && !is_compilable_fn(node, device_type)) { - VLOG(2) << "Compilation rejected node: not compilable " << node->name() - << ": " << node->type_string(); + // is_compilable_fn has already logged the reason if it returned false. continue; } @@ -384,14 +389,14 @@ Status FindCompilationCandidates( DeviceType jit_device_type(registration->compilation_device_name); if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, 0, lib_runtime)) { - VLOG(2) << "Compilation rejected node: unsupported op " << node->name() - << ": " << node->type_string(); + VLOG(2) << "Rejecting " << node->name() << ": unsupported op " + << node->type_string(); continue; } if (!registration->compile_resource_ops && HasResourceInputOrOutput(*node)) { - VLOG(2) << "Compilation rejected node: resource input/output " - << node->name() << ": " << node->type_string(); + VLOG(2) << "Rejecting: " << node->name() << ": resource input/output " + << node->type_string(); continue; } if (node->type_string() == "While" && @@ -401,15 +406,11 @@ Status FindCompilationCandidates( // _Arg nodes in a top-level function represent feeds. // Do not compile them. if (node->type_string() == "_Arg") { - VLOG(2) << "Skipping jit compilation for '_Arg'-typed node " - << node->DebugString(); continue; } // _Retval nodes in a top-level function represent fetches. // Do not compile them. if (node->type_string() == "_Retval") { - VLOG(2) << "Compilation rejected node: return value " << node->name() - << ": " << node->type_string(); continue; } candidates->insert(node); @@ -462,6 +463,7 @@ Status MarkForCompilationPass::Run( VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit; VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only; + VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit; const FunctionLibraryDefinition* fld = options.flib_def; std::unique_ptr deadness; @@ -474,6 +476,7 @@ Status MarkForCompilationPass::Run( const XlaOpRegistry::DeviceRegistration* registration; if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)) { + VLOG(2) << "Rejecting " << node->name() << ": could not find JIT device."; return false; } @@ -483,21 +486,36 @@ Status MarkForCompilationPass::Run( // If there is a _XlaCompile annotation, use its value. bool compile = false; Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); - if (status.ok()) return compile; + if (status.ok()) { + if (!compile) { + VLOG(2) << "Rejecting " << node->name() << ": kXlaCompileAttr(" + << kXlaCompileAttr << ") is false."; + } + return compile; + } status = fld->GetAttr(*node, kXlaCompileAttr, &compile); - if (status.ok()) return compile; + if (status.ok()) { + if (!compile) { + VLOG(2) << "Rejecting " << node->name() << ": kXlaCompileAttr(" + << kXlaCompileAttr << ") on callee is false."; + } + return compile; + } // If inputs to `node` can have conflicting deadness (i.e. some are alive // and some are dead) then don't compile it. XLA cannot represent the // deadness semantics of these nodes correctly and auto-clustering these // nodes can cause deadness to propagate to nodes that should be live. if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) { + VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness."; return false; } // Check for fusable ops only if requested. if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) { + VLOG(2) << "Rejecting " << node->name() + << ": not fusable op but fusion_only enabled."; return false; } @@ -505,8 +523,17 @@ Status MarkForCompilationPass::Run( // Ignore enable_jit_by_default if global jit compilation for CPU // is explicitly requested via tf_xla_cpu_global_jit flag bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU; - return (ignore_registration || registration->enable_jit_by_default) && - global_jit_level > 0; + bool should_compile = + (ignore_registration || registration->enable_jit_by_default) && + global_jit_level > 0; + if (!should_compile) { + if (global_jit_level <= 0) { + VLOG(2) << "Rejecting " << node->name() << ": global jit disabled."; + } else { + VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled."; + } + } + return should_compile; }; return RunImpl(options, is_compilable); } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index e9acbfb19e42cb43cb0b986c438a569de29b2ebc..f1137af3c1e8539fda318d88d2c5b5187953ccab 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -40,20 +40,18 @@ class MarkForCompilationPass : public GraphOptimizationPass { Status Run(const GraphOptimizationPassOptions& options) override; - // Run() just calls RunImpl() if --tf_xla_auto_jit is enabled. To run the pass - // unconditionally, call RunImpl() directly. - // is_compilable_fn, if set, is a predicate that must be true for a node to - // be compiled. + private: Status RunImpl(const GraphOptimizationPassOptions& options, const std::function& is_compilable_fn = {}); + + friend class MarkForCompilationPassTestHelper; }; // Returns true iff 'ndef' is a call to a function that is compilable. A // function is compilable iff every operator in the function body is // compilable. bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef); - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_H_ diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 2c5f4fb774fcab082c0d0d316cdc6757cacc1e96..a780d4a936a3b757495c26d337f19c80a67f343a 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" +#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/array_ops.h" @@ -39,27 +39,6 @@ namespace { REGISTER_OP("UncompilableNullary").Output("o: float"); REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); -Status MarkForCompilation(std::unique_ptr* graph, - FunctionLibraryDefinition* flib_def) { - // Assign all nodes to the CPU device. - static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0"; - for (Node* n : (*graph)->nodes()) { - n->set_assigned_device_name(kCpuDevice); - } - - GraphOptimizationPassOptions opt_options; - opt_options.graph = graph; - opt_options.flib_def = flib_def; - MarkForCompilationPass pass; - return pass.RunImpl(opt_options); -} - -Status MarkForCompilation(std::unique_ptr* graph) { - FunctionDefLibrary flib; - FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib); - return MarkForCompilation(graph, &flib_def); -} - std::unordered_map GetClusters(const Graph& graph) { std::unordered_map ids; for (Node* node : graph.nodes()) { @@ -88,7 +67,7 @@ TEST(XlaCompilationTest, Chains) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(4, clusters.size()); EXPECT_EQ(clusters["B"], clusters["C"]); @@ -113,7 +92,7 @@ TEST(XlaCompilationTest, UncompilableCycles) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); @@ -133,7 +112,7 @@ TEST(XlaCompilationTest, CompilableCycles) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(3, clusters.size()); @@ -156,7 +135,7 @@ TEST(XlaCompilationTest, Complex128Unsupported) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); } @@ -177,7 +156,7 @@ TEST(XlaCompilationTest, HalfSupported) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_FALSE(clusters.empty()); } @@ -206,7 +185,7 @@ TEST(XlaCompilationTest, ConcatWithConstArg) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(3, clusters.size()); // Everything should be compiled. } @@ -241,7 +220,8 @@ TEST(XlaCompilationTest, FunctionCalls) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph, &flib_def)); + TF_ASSERT_OK( + MarkForCompilationPassTestHelper::MarkForCompilation(&graph, &flib_def)); auto clusters = GetClusters(*graph); EXPECT_EQ(2, clusters.size()); @@ -272,7 +252,7 @@ TEST(XlaCompilationTest, MetadataOpsDontStartClusters) { ops::UnaryOp("Shape", d, builder.opts().WithName("E")); TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(0, clusters.size()); // Nothing should be compiled. } @@ -359,7 +339,7 @@ TEST(XlaCompilationTest, SymbolicGradients) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(2, clusters.size()); @@ -384,7 +364,7 @@ TEST(XlaCompilationTest, Loops) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_EXPECT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // Nothing should be compiled. In particular, 'd' and 'c' must not be @@ -411,7 +391,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) { TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // The computation is: C = A + relu(A) @@ -442,7 +422,7 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) { TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // The computation is: D = relu(A) + (A @ relu(A)) @@ -472,7 +452,7 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) { TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); // The computation is: C = A @ relu(A) @@ -512,7 +492,7 @@ TEST(XlaCompilationTest, Resources) { ops::UnaryOp("Relu", d, builder.opts().WithName("E")); TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(0, clusters.size()); // Nothing should be compiled. } @@ -542,7 +522,7 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) { TF_EXPECT_OK(root.ToGraph(graph.get())); - Status status = MarkForCompilation(&graph); + Status status = MarkForCompilationPassTestHelper::MarkForCompilation(&graph); EXPECT_FALSE(status.ok()); EXPECT_TRUE(str_util::StrContains(status.ToString(), "Edge from c to a would create a cycle.\n" @@ -570,7 +550,7 @@ TEST(XlaCompilationTest, Retval) { TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get())); } - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_EQ(2, clusters.size()); @@ -588,7 +568,7 @@ TEST(XlaCompilationTest, DontCountIdentityOps) { auto r = ops::_Retval(root.WithOpName("R"), c, 0); } TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); @@ -604,7 +584,7 @@ TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) { auto r = ops::_Retval(root.WithOpName("R"), b, 0); } TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); auto clusters = GetClusters(*graph); EXPECT_TRUE(clusters.empty()); @@ -618,7 +598,7 @@ TEST(XlaCompilationTest, ConstOp) { auto c = ops::Const(root.WithOpName("const"), 0.5f); c.node()->AddAttr(kXlaCompileAttr, true); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); EXPECT_EQ(1, GetClusters(*graph).size()); } @@ -629,7 +609,7 @@ TEST(XlaCompilationTest, ConstOp) { auto c = ops::Const(root.WithOpName("const"), string("string")); c.node()->AddAttr(kXlaCompileAttr, true); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); EXPECT_TRUE(GetClusters(*graph).empty()); } } @@ -644,7 +624,7 @@ TEST(XlaCompilationTest, DontClusterIdentityWithRefInput) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); std::unordered_map clusters = GetClusters(*graph); @@ -667,7 +647,7 @@ TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); std::unordered_map clusters = GetClusters(*graph); @@ -699,7 +679,7 @@ TEST(XlaCompilationTest, ClusterControlTrigger) { std::unique_ptr graph(new Graph(OpRegistry::Global())); TF_ASSERT_OK(root.ToGraph(graph.get())); - TF_ASSERT_OK(MarkForCompilation(&graph)); + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); std::unordered_map clusters = GetClusters(*graph); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..a84b82e47923b2e7eec0e7eb848bd4377befbd07 --- /dev/null +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc @@ -0,0 +1,40 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h" + +namespace tensorflow { +/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation( + std::unique_ptr* graph, FunctionLibraryDefinition* flib_def) { + // Assign all nodes to the CPU device. + static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0"; + for (Node* n : (*graph)->nodes()) { + n->set_assigned_device_name(kCpuDevice); + } + + GraphOptimizationPassOptions opt_options; + opt_options.graph = graph; + opt_options.flib_def = flib_def; + MarkForCompilationPass pass; + return pass.RunImpl(opt_options); +} + +/*static*/ Status MarkForCompilationPassTestHelper::MarkForCompilation( + std::unique_ptr* graph) { + FunctionDefLibrary flib; + FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib); + return MarkForCompilation(graph, &flib_def); +} +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..b9a0531cb0e431a98d57a6d9a2e3e41b51e7b743 --- /dev/null +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ +#define TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ + +#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" + +namespace tensorflow { +class MarkForCompilationPassTestHelper { + public: + // Runs the MarkForCompilation pass on `graph` after assigning all nodes in + // `graph` to the CPU device. To make testing easier, ignores device + // registration, _XlaCompile attributes, input deadness and global jit level. + static Status MarkForCompilation(std::unique_ptr* graph, + FunctionLibraryDefinition* flib_def); + + // Like `MarkForCompilation` but creates `flib_def` from the op registry. + static Status MarkForCompilation(std::unique_ptr* graph); +}; +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_MARK_FOR_COMPILATION_PASS_TEST_HELPER_H_ diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 08c357c87919760fffa43f0d014e5ce82035d138..7140d47a9421ec73d0144e855b490f89569e6ae9 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -258,6 +258,7 @@ Status XlaCompilationCache::CompileImpl( xla::LocalExecutable** executable, const XlaCompiler::CompileOptions* compile_options, bool compile_single_op) { + CHECK_NE(executable, nullptr); VLOG(1) << "XlaCompilationCache::Compile " << DebugString(); if (VLOG_IS_ON(2)) { @@ -295,7 +296,7 @@ Status XlaCompilationCache::CompileImpl( // protect the contents of the cache entry. Entry* entry; { - mutex_lock lock(mu_); + mutex_lock lock(compile_cache_mu_); // Find or create a cache entry. std::unique_ptr& e = cache_[signature]; if (!e) { @@ -311,6 +312,8 @@ Status XlaCompilationCache::CompileImpl( if (!entry->compiled) { VLOG(1) << "Compilation cache miss for signature: " << SignatureDebugString(signature); + tensorflow::Env* env = tensorflow::Env::Default(); + const uint64 compile_start_us = env->NowMicros(); // Do the actual JIT compilation without holding the lock (it can take // a long time.) std::vector args; @@ -329,18 +332,35 @@ Status XlaCompilationCache::CompileImpl( compile_options ? *compile_options : XlaCompiler::CompileOptions(), function, args, &entry->compilation_result); } - } - *compilation_result = &entry->compilation_result; - if (entry->compilation_status.ok() && executable) { - if (entry->executable == nullptr) { - entry->compilation_status = BuildExecutable( - options, entry->compilation_result, &entry->executable); + TF_RETURN_IF_ERROR(entry->compilation_status); + CHECK_EQ(entry->executable.get(), nullptr); + entry->compilation_status = + BuildExecutable(options, entry->compilation_result, &entry->executable); + + const uint64 compile_end_us = env->NowMicros(); + const uint64 compile_time_us = compile_end_us - compile_start_us; + { + mutex_lock lock(compile_stats_mu_); + auto it = compile_stats_.emplace(function.name(), CompileStats{}).first; + it->second.compile_count++; + it->second.cumulative_compile_time_us += compile_time_us; + VLOG(1) << "compiled " << function.name() << " " + << it->second.compile_count + << " times, compile time: " << compile_time_us + << " us, cumulative: " << it->second.cumulative_compile_time_us + << " us (" + << tensorflow::strings::HumanReadableElapsedTime(compile_time_us / + 1.0e6) + << " / " + << tensorflow::strings::HumanReadableElapsedTime( + it->second.cumulative_compile_time_us / 1.0e6) + << ")"; } - *executable = entry->executable.get(); } - - Status status = entry->compilation_status; - return status; + TF_RETURN_IF_ERROR(entry->compilation_status); + *compilation_result = &entry->compilation_result; + *executable = entry->executable.get(); + return Status::OK(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h index be1043d8c3fc0573922837e541615114a6d7a1a5..fc5f008f4f52c32d97e680784082d0e7bcb7d8eb 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.h +++ b/tensorflow/compiler/jit/xla_compilation_cache.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -150,9 +151,22 @@ class XlaCompilationCache : public ResourceBase { std::unique_ptr executable GUARDED_BY(mu); }; - mutex mu_; - std::unordered_map, Signature::Hash> cache_ - GUARDED_BY(mu_); + mutex compile_cache_mu_; + gtl::FlatMap, Signature::Hash> cache_ + GUARDED_BY(compile_cache_mu_); + + struct CompileStats { + // Number of times the cluster has been (re-)compiled. + int64 compile_count = 0; + + // Cumulative time spent compiling the cluster. + int64 cumulative_compile_time_us = 0; + }; + mutex compile_stats_mu_; + + // Maps cluster names to compilation statistics for said cluster. + gtl::FlatMap compile_stats_ + GUARDED_BY(compile_stats_mu_); TF_DISALLOW_COPY_AND_ASSIGN(XlaCompilationCache); }; diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index d288d37bc75380168a31937024dd41bdbe7dce9d..dd84fb34c171f8d2174444ddd3b3b476e7142718 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -71,13 +72,14 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, run_options.set_stream(stream); run_options.set_allocator(client->backend().memory_allocator()); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); - run_options.set_rng_seed(ctx->step_id()); + run_options.set_rng_seed(GetXLARandomSeed()); xla::StatusOr run_result = executable->Run(launch_context.arguments(), run_options); TF_RETURN_IF_ERROR(run_result.status()); - launch_context.PopulateOutputs(ctx, result, run_result.ConsumeValueOrDie()); + TF_RETURN_IF_ERROR(launch_context.PopulateOutputs( + ctx, result, run_result.ConsumeValueOrDie())); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index c55eba2f79ddcf10931ea659a64df559cef06ec5..2a2691a6a404520da4df451293ec0cb6028a165d 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/dma_helper.h" @@ -211,17 +212,20 @@ XlaDevice::XlaDevice( use_multiple_streams), device_ordinal_(device_ordinal), jit_device_name_(jit_device_name), - xla_allocator_(nullptr), platform_(platform), use_multiple_streams_(use_multiple_streams), transfer_as_literal_(transfer_as_literal), shape_representation_fn_(shape_representation_fn) { - VLOG(1) << "Created XLA device " << jit_device_name; + VLOG(1) << "Created XLA device " << jit_device_name << " " << this; + thread_pool_.reset(new thread::ThreadPool(options.env, "xla_device", + /*num_threads=*/1)); } XlaDevice::~XlaDevice() { - if (gpu_device_info_ != nullptr) { - gpu_device_info_->default_context->Unref(); + VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this; + mutex_lock lock(mu_); + if (device_context_) { + device_context_->Unref(); } } @@ -237,6 +241,11 @@ xla::LocalClient* XlaDevice::client() const { } Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) { + mutex_lock lock(mu_); + return GetAllocatorLocked(attr); +} + +Allocator* XlaDevice::GetAllocatorLocked(AllocatorAttributes attr) { if (attr.on_host()) { return cpu_allocator(); } @@ -249,83 +258,111 @@ Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) { return xla_allocator_; } -xla::StatusOr XlaDevice::GetStream() { - if (!stream_) { - xla::Backend* backend = client()->mutable_backend(); - TF_ASSIGN_OR_RETURN(stream_, backend->BorrowStream(device_ordinal_)); - } - return stream_.get(); +Status XlaDevice::EnsureDeviceContextOk() { + mutex_lock lock(mu_); + return GetDeviceContextLocked().status(); } -xla::StatusOr XlaDevice::GetDeviceToHostStream() { - if (!use_multiple_streams_) { - return GetStream(); - } - if (!device_to_host_stream_) { - xla::Backend* backend = client()->mutable_backend(); - TF_ASSIGN_OR_RETURN(device_to_host_stream_, - backend->BorrowStream(device_ordinal_)); +Status XlaDevice::EnsureStreamOkLocked(xla::Backend* backend, + const string& name, + std::shared_ptr* stream, + bool* stream_was_changed) { + if (!(*stream) || !(*stream)->ok()) { + xla::StreamPool::Ptr ptr; + TF_ASSIGN_OR_RETURN(ptr, backend->BorrowStream(device_ordinal_)); + *stream = std::shared_ptr(std::move(ptr)); + VLOG(1) << "XlaDevice " << this << " new " << name << " " + << (*stream)->DebugStreamPointers(); + *stream_was_changed = true; } - return device_to_host_stream_.get(); + return Status::OK(); } -xla::StatusOr XlaDevice::GetHostToDeviceStream() { - if (!use_multiple_streams_) { - return GetStream(); +xla::StatusOr XlaDevice::GetDeviceContextLocked() { + xla::Backend* backend = client()->mutable_backend(); + + // Ensure all our streams are valid, borrowing new streams if necessary. + bool need_new_device_context = !device_context_; + TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "stream", &stream_, + &need_new_device_context)); + + std::shared_ptr host_to_device_stream = stream_; + std::shared_ptr device_to_host_stream = stream_; + if (use_multiple_streams_) { + TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "host_to_device_stream", + &host_to_device_stream_, + &need_new_device_context)); + TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "device_to_host_stream", + &device_to_host_stream_, + &need_new_device_context)); + host_to_device_stream = host_to_device_stream_; + device_to_host_stream = device_to_host_stream_; } - if (!host_to_device_stream_) { - xla::Backend* backend = client()->mutable_backend(); - TF_ASSIGN_OR_RETURN(host_to_device_stream_, - backend->BorrowStream(device_ordinal_)); + + if (!need_new_device_context) { + return device_context_; } - return host_to_device_stream_.get(); -} -Status XlaDevice::CreateAndSetGpuDeviceInfo() { - if (gpu_device_info_ == nullptr) { - TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - // Call GetAllocator for the side-effect of ensuring the allocator - // is created. - GetAllocator({}); - // XlaDevice owns both gpu_device_info_ and - // gpu_device_info_->default_context. - gpu_device_info_ = MakeUnique(); - gpu_device_info_->stream = stream; - gpu_device_info_->default_context = - new XlaDeviceContext(stream, stream, stream, client(), - transfer_as_literal_, shape_representation_fn_); - set_tensorflow_gpu_device_info(gpu_device_info_.get()); + // At this point we know we need a new device context. + // Call GetAllocator for the side-effect of ensuring the allocator is created. + GetAllocatorLocked({}); + if (device_context_) { + device_context_->Unref(); + } + // The XlaDeviceContext keeps a reference count to the streams, and the + // XlaDeviceContext remains live for the duration of a Executor run. This + // ensures that the streams remain live for the duration of a run, even if + // an error is encountered and the streams are replaced with new ones. + device_context_ = new XlaDeviceContext( + stream_, host_to_device_stream, device_to_host_stream, client(), + transfer_as_literal_, shape_representation_fn_, thread_pool_.get()); + VLOG(1) << "XlaDevice " << this << " new XlaDeviceContext " + << device_context_; + + // Create and set a new GpuDeviceInfo, if necessary. + // + // TODO(b/78232898): This isn't thread-safe; there is a race between the call + // to set_tensorflow_gpu_device_info() with ops that call the getter + // tensorflow_gpu_device_info(). This isn't trivially fixed by adding locking + // to those methods; see the bug for details. Our only saving grace at the + // moment is that this race doesn't seem to occur in practice. + if (use_gpu_device_info_) { + auto gpu_device_info = MakeUnique(); + gpu_device_info->stream = stream_.get(); + gpu_device_info->default_context = device_context_; + set_tensorflow_gpu_device_info(gpu_device_info.get()); + gpu_device_info_ = std::move(gpu_device_info); + VLOG(1) << "XlaDevice " << this << " new GpuDeviceInfo " + << gpu_device_info_.get(); } - return Status::OK(); + return device_context_; +} + +Status XlaDevice::UseGpuDeviceInfo() { + mutex_lock lock(mu_); + use_gpu_device_info_ = true; + return GetDeviceContextLocked().status(); } Status XlaDevice::FillContextMap(const Graph* graph, DeviceContextMap* device_context_map) { VLOG(1) << "XlaDevice::FillContextMap"; - device_context_map->resize(graph->num_node_ids()); - TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - TF_ASSIGN_OR_RETURN(se::Stream * device_to_host_stream, - GetDeviceToHostStream()); - TF_ASSIGN_OR_RETURN(se::Stream * host_to_device_stream, - GetHostToDeviceStream()); + mutex_lock lock(mu_); + TF_ASSIGN_OR_RETURN(XlaDeviceContext * device_context, + GetDeviceContextLocked()); - // Call GetAllocator for the side-effect of ensuring the allocator is created. - GetAllocator({}); - auto ctx = new XlaDeviceContext( - stream, host_to_device_stream, device_to_host_stream, client(), - transfer_as_literal_, shape_representation_fn_); + device_context_map->resize(graph->num_node_ids()); for (Node* n : graph->nodes()) { VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name(); - ctx->Ref(); - (*device_context_map)[n->id()] = ctx; + device_context->Ref(); + (*device_context_map)[n->id()] = device_context; } - ctx->Unref(); return Status::OK(); } void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { - VLOG(1) << "XlaDevice::Compute " << op_kernel->name() << ":" + VLOG(2) << "XlaDevice::Compute " << op_kernel->name() << ":" << op_kernel->type_string(); // When Xprof profiling is off (which is the default), constructing the // activity is simple enough that its overhead is negligible. @@ -336,13 +373,29 @@ void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) { - VLOG(1) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":" + VLOG(2) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":" << op_kernel->type_string(); tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), op_kernel->IsExpensive()); op_kernel->ComputeAsync(context, done); } +Status XlaDevice::Sync() { + VLOG(1) << "XlaDevice::Sync"; + std::shared_ptr stream; + { + mutex_lock lock(mu_); + stream = stream_; + } + if (!stream) return Status::OK(); + + if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) { + return errors::Internal("XlaDevice::Sync() failed."); + } + VLOG(1) << "XlaDevice::Sync completed"; + return Status::OK(); +} + Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, Tensor* tensor) { @@ -358,21 +411,17 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, if (alloc_attrs.on_host()) { *tensor = parsed; } else { - Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape()); + mutex_lock lock(mu_); + TF_ASSIGN_OR_RETURN(XlaDeviceContext * device_context, + GetDeviceContextLocked()); + Allocator* allocator = GetAllocatorLocked(alloc_attrs); + Tensor copy(allocator, parsed.dtype(), parsed.shape()); Notification n; - TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - TF_ASSIGN_OR_RETURN(se::Stream * device_to_host_stream, - GetDeviceToHostStream()); - TF_ASSIGN_OR_RETURN(se::Stream * host_to_device_stream, - GetHostToDeviceStream()); - XlaTransferManager manager(stream, host_to_device_stream, - device_to_host_stream, client(), - transfer_as_literal_, shape_representation_fn_); - manager.CopyCPUTensorToDevice(&parsed, this, ©, - [&n, &status](const Status& s) { - status = s; - n.Notify(); - }); + device_context->CopyCPUTensorToDevice(&parsed, this, ©, + [&n, &status](const Status& s) { + status = s; + n.Notify(); + }); n.WaitForNotification(); *tensor = copy; } diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 4a5942fbd7f5bfd28e1ec96c6b0dc9e28dd418c5..dbf35f349f84268ebac0f73a86c9ca0704e90835 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -25,11 +25,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ +#include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/allocator.h" @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace tensorflow { @@ -117,62 +118,88 @@ class XlaDevice : public LocalDevice { const PaddedShapeFn& padded_shape_fn); ~XlaDevice() override; - Allocator* GetAllocator(AllocatorAttributes attr) override; + Allocator* GetAllocator(AllocatorAttributes attr) override + LOCKS_EXCLUDED(mu_); void Compute(OpKernel* op_kernel, OpKernelContext* context) override; void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) override; - Status Sync() override { return Status::OK(); } + Status Sync() override; Status FillContextMap(const Graph* graph, - DeviceContextMap* device_context_map) override; + DeviceContextMap* device_context_map) override + LOCKS_EXCLUDED(mu_); Status MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, - Tensor* tensor) override; + Tensor* tensor) override LOCKS_EXCLUDED(mu_); - xla::LocalClient* client() const; const Metadata& metadata() { return xla_metadata_; } - xla::StatusOr GetStream(); - xla::StatusOr GetHostToDeviceStream(); - xla::StatusOr GetDeviceToHostStream(); - // If not already set, create and set GpuDeviceInfo. - // Not thread-safe - Status CreateAndSetGpuDeviceInfo(); + // Ensures the DeviceContext associated with this XlaDevice is created and + // valid (i.e. all streams are ok). If any state is not valid, a new + // DeviceContext will be created. + // + // TODO(b/111859745): The Eager context needs to call this method to recover + // from failures. + Status EnsureDeviceContextOk() LOCKS_EXCLUDED(mu_); + + // Instructs this XlaDevice to set a GpuDeviceInfo, which holds extra + // information for GPU and TPU devices. + Status UseGpuDeviceInfo() LOCKS_EXCLUDED(mu_); private: + xla::LocalClient* client() const; + Allocator* GetAllocatorLocked(AllocatorAttributes attr) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + Status EnsureStreamOkLocked(xla::Backend* backend, const string& name, + std::shared_ptr* stream, + bool* stream_was_changed) + EXCLUSIVE_LOCKS_REQUIRED(mu_); + xla::StatusOr GetDeviceContextLocked() + EXCLUSIVE_LOCKS_REQUIRED(mu_); + + mutex mu_; // The metadata of this XlaDevice. const Metadata xla_metadata_; // Which hardware device in the client's platform this XlaDevice controls. const int device_ordinal_; // The name of the device that is used to compile Ops for this XlaDevice. - DeviceType jit_device_name_; + const DeviceType jit_device_name_; + // The platform for this device. + se::Platform* const platform_; // Not owned. // Memory allocator associated with this device. - Allocator* xla_allocator_; // Not owned. - se::Platform* platform_; // Not owned. + Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr; // Not owned. // Stream associated with this device. Operations enqueued on this // stream are executed on the device. Operations include data // copying back and forth between CPU and the device, and // computations enqueued by XLA. - xla::StreamPool::Ptr stream_; - // If true, only stream_ is valid and all computation and transfers use - // stream_. If false, computation is performed by stream_ and transfers are + std::shared_ptr stream_ GUARDED_BY(mu_); + // If false, only stream_ is valid and all computation and transfers use + // stream_. If true, computation is performed by stream_ and transfers are // performed by host_to_device/device_to_host_stream. - bool use_multiple_streams_; + const bool use_multiple_streams_; // If use_multiple_streams_, host to device transfers are performed using this // stream. - xla::StreamPool::Ptr host_to_device_stream_; + std::shared_ptr host_to_device_stream_ GUARDED_BY(mu_); // If use_multiple_streams_, device to host transfers are performed using this // stream. - xla::StreamPool::Ptr device_to_host_stream_; + std::shared_ptr device_to_host_stream_ GUARDED_BY(mu_); // Must we use XLA's transfer manager for correct host<->device transfers? if // false, we can use ThenMemcpy() instead. - bool transfer_as_literal_; - XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + const bool transfer_as_literal_; + const XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + + // The device context accessed by all users of the XlaDevice, set by calls to + // EnsureDeviceContextOk. If gpu_device_info_ is non-null, this pointer is + // also filled in to that struct. XlaDeviceContext is a ref-counted object. + XlaDeviceContext* device_context_ GUARDED_BY(mu_) = nullptr; + + // Holds extra information for GPU and TPU devices, e.g. the device context. + bool use_gpu_device_info_ GUARDED_BY(mu_) = false; + std::unique_ptr gpu_device_info_ GUARDED_BY(mu_); - // If set, holds default device context (that we must Unref) - // and its stream. - std::unique_ptr gpu_device_info_; + // Thread pool used for running closures + std::unique_ptr thread_pool_; }; // Builds OpKernel registrations on 'device' for the JIT operators diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 8cf198239c84c3720585f53ebc95876ce4396793..0a0c0892411e8ebcd5624a29f3bd020fe6483944 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device_context.h" +#include + +#include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -48,17 +51,20 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) { void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); } XlaTransferManager::XlaTransferManager( - se::Stream* compute_stream, se::Stream* host_to_device_stream, - se::Stream* device_to_host_stream, xla::LocalClient* client, + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, xla::LocalClient* client, bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn) - : stream_(compute_stream), - host_to_device_stream_(host_to_device_stream), - device_to_host_stream_(device_to_host_stream), + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool) + : stream_(std::move(compute_stream)), + host_to_device_stream_(std::move(host_to_device_stream)), + device_to_host_stream_(std::move(device_to_host_stream)), client_(client), transfer_manager_(client->backend().transfer_manager()), transfer_as_literal_(transfer_as_literal), - shape_representation_fn_(std::move(shape_representation_fn)) { + shape_representation_fn_(std::move(shape_representation_fn)), + thread_pool_(thread_pool) { CHECK(host_to_device_stream_ != nullptr); CHECK(device_to_host_stream_ != nullptr); CHECK(stream_ != nullptr); @@ -88,47 +94,40 @@ Status XlaTransferManager::TransferLiteralToDevice( if (UseMultipleStreams()) { // Initially wait for the compute stream so that memory allocations are // synchronized. - host_to_device_stream_->ThenWaitFor(stream_); + host_to_device_stream_->ThenWaitFor(stream_.get()); } TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync( - host_to_device_stream_, *literal, shaped_buffer)); + host_to_device_stream_.get(), *literal, shaped_buffer)); if (UseMultipleStreams()) { - se::Event event(stream_->parent()); - TF_RET_CHECK(event.Init()) << "Event failed to initialize!"; - host_to_device_stream_->ThenRecordEvent(&event); - xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event)); + auto event = std::make_shared(stream_->parent()); + TF_RET_CHECK(event->Init()) << "Event failed to initialize!"; + host_to_device_stream_->ThenRecordEvent(event.get()); + xla_tensor->SetDefinedOn(host_to_device_stream_.get(), std::move(event)); } // Unref the host tensor, and capture the literal shared_ptr too so it goes // out of scope when the lambda completes. host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); }); + return Status::OK(); } void XlaTransferManager::TransferLiteralFromDevice( Tensor* host_tensor, const Tensor& device_tensor, const StatusCallback& done) const { + xla::MutableBorrowingLiteral literal; + TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(host_tensor, &literal)); + const xla::ShapedBuffer& shaped_buffer = XlaTensor::FromTensor(&device_tensor)->shaped_buffer(); TensorReference ref(device_tensor); transfer_manager_->TransferLiteralFromDevice( - device_to_host_stream_, shaped_buffer, - [=, &shaped_buffer]( - xla::StatusOr > literal_or) { + device_to_host_stream_.get(), shaped_buffer, literal, + [=, &shaped_buffer, &literal](xla::Status status) { ref.Unref(); done([&]() -> Status { - TF_ASSIGN_OR_RETURN(auto literal, std::move(literal_or)); - VLOG(1) << "Transfer from device as literal: " << literal->ToString() + VLOG(1) << "Transfer from device as literal: " << literal.ToString() << " " << shaped_buffer.ToString(); - Tensor tensor; - TF_RETURN_IF_ERROR( - LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor)); - // Reshape the tensor back to its declared shape. - Status status; - if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) { - status = errors::Internal( - "Tensor::CopyFrom failed when copying from XLA device to CPU"); - } return status; }()); }); @@ -186,8 +185,14 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor, status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor); if (status.ok()) { xla_tensor->set_host_tensor(*cpu_tensor); - host_to_device_stream_->ThenDoHostCallback( - [done]() { done(Status::OK()); }); + host_to_device_stream_->ThenDoHostCallback([this, done]() { + // We must not call the done closure directly from DoHostCallback + // to avoid a deadlock. If done() is the callback that ends an + // Executor's run, the Executor may call XlaDevice::Sync() inside the + // callback. This deadlocks, because XlaDevice::Sync() waits for all + // stream activity to complete. + thread_pool_->Schedule([done]() { done(Status::OK()); }); + }); return; } } else { @@ -199,7 +204,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor, if (!block_status.ok()) { status = xla::InternalError( "Failed to complete data transfer on stream %p: %s", - host_to_device_stream_, block_status.error_message().c_str()); + host_to_device_stream_.get(), block_status.error_message().c_str()); } } xla_tensor->set_host_tensor(*cpu_tensor); @@ -232,9 +237,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor, XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); if (se::Event* event = - xla_tensor->GetDefinitionEvent(device_to_host_stream_)) { + xla_tensor->GetDefinitionEvent(device_to_host_stream_.get())) { device_to_host_stream_->ThenWaitFor(event); - xla_tensor->SetDefinedOn(device_to_host_stream_); + xla_tensor->SetDefinedOn(device_to_host_stream_.get()); } Status status; @@ -247,7 +252,7 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor, Status block_status = device_to_host_stream_->BlockHostUntilDone(); if (!block_status.ok()) { status = xla::InternalError( - "Failed to complete data transfer on stream %p: %s", stream_, + "Failed to complete data transfer on stream %p: %s", stream_.get(), block_status.error_message().c_str()); } } @@ -285,14 +290,14 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor, if (stream_ != device_to_device_stream) { // Initially wait for the compute stream so that memory allocations are // synchronized. - device_to_device_stream->ThenWaitFor(stream_); + device_to_device_stream->ThenWaitFor(stream_.get()); } } if (se::Event* event = - xla_src->GetDefinitionEvent(device_to_device_stream)) { + xla_src->GetDefinitionEvent(device_to_device_stream.get())) { device_to_device_stream->ThenWaitFor(event); - xla_src->SetDefinedOn(device_to_device_stream); + xla_src->SetDefinedOn(device_to_device_stream.get()); } auto from_iter = xla_src->shaped_buffer().buffers().begin(); @@ -304,28 +309,37 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor, } if (UseMultipleStreams()) { - se::Event event(stream_->parent()); - CHECK(event.Init()); - device_to_device_stream->ThenRecordEvent(&event); - xla_dst->SetDefinedOn(device_to_device_stream, std::move(event)); + auto event = std::make_shared(stream_->parent()); + TF_RET_CHECK(event->Init()) << "Event failed to initialize"; + device_to_device_stream->ThenRecordEvent(event.get()); + xla_dst->SetDefinedOn(device_to_device_stream.get(), std::move(event)); } return Status::OK(); }(); if (!status.ok()) { return done(status); } else { - stream_->ThenDoHostCallback([=]() { done(Status::OK()); }); + stream_->ThenDoHostCallback([this, done]() { + // We must not call the done closure directly from DoHostCallback to avoid + // a deadlock. If done() is the callback that ends an Executor's run, the + // Executor may call XlaDevice::Sync() inside the callback. This + // deadlocks, because XlaDevice::Sync() waits for all stream activity to + // complete. + thread_pool_->Schedule([done]() { done(Status::OK()); }); + }); } } XlaDeviceContext::XlaDeviceContext( - se::Stream* compute_stream, se::Stream* host_to_device_stream, - se::Stream* device_to_host_stream, xla::LocalClient* client, + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, xla::LocalClient* client, bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn) - : manager_(compute_stream, host_to_device_stream, device_to_host_stream, - client, transfer_as_literal, - std::move(shape_representation_fn)) {} + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool) + : manager_(std::move(compute_stream), std::move(host_to_device_stream), + std::move(device_to_host_stream), client, transfer_as_literal, + std::move(shape_representation_fn), thread_pool) {} void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index 912f8d779e72f44821bc4fb25efa30bd35d01412..2e7445340cbaf788bfd06260f4376596895231c1 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -47,10 +47,12 @@ class XlaDeviceAllocator : public Allocator { class XlaTransferManager { public: explicit XlaTransferManager( - se::Stream* compute_stream, se::Stream* host_to_device_stream, - se::Stream* device_to_host_stream, xla::LocalClient* client, - bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn); + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, + xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, StatusCallback done) const; @@ -61,7 +63,7 @@ class XlaTransferManager { void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor, const StatusCallback& done); - se::Stream* stream() const { return stream_; } + se::Stream* stream() const { return stream_.get(); } private: Status TransferLiteralToDevice(const Tensor& host_tensor, @@ -73,13 +75,13 @@ class XlaTransferManager { // The main compute stream of the device, used to synchronize the transfer // streams if they are set. - se::Stream* stream_; + std::shared_ptr stream_; // The stream to use for transferring data from host to device. Can be // idential to stream_, but must not be nullptr. - se::Stream* host_to_device_stream_; + std::shared_ptr host_to_device_stream_; // The stream to use for transferring data from device to host. Can be // idential to stream_, but must not be nullptr. - se::Stream* device_to_host_stream_; + std::shared_ptr device_to_host_stream_; // For the underlying memory allocator and XLA's TransferManager. xla::LocalClient* client_; // Transfer manager, for marshalling data to and from the device. @@ -87,6 +89,9 @@ class XlaTransferManager { // True if we must use XLA's TransferManager for correct device transfers. const bool transfer_as_literal_; XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + + // Thread pool used for running closures + thread::ThreadPool* thread_pool_; }; // DeviceContext for operators assigned to XlaDevice devices. The @@ -95,10 +100,12 @@ class XlaTransferManager { class XlaDeviceContext : public DeviceContext { public: explicit XlaDeviceContext( - se::Stream* compute_stream, se::Stream* host_to_device_stream, - se::Stream* device_to_host_stream, xla::LocalClient* client, - bool transfer_as_literal, - XlaCompiler::ShapeRepresentationFn shape_representation_fn); + std::shared_ptr compute_stream, + std::shared_ptr host_to_device_stream, + std::shared_ptr device_to_host_stream, + xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + thread::ThreadPool* thread_pool); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 6adda327f186a607b4e7371bf4c5071dd86582da..da3e329247e825d4a33a53dc310899d6ba6ce9cf 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -23,7 +23,11 @@ limitations under the License. #include "tensorflow/core/kernels/cast_op.h" #include "tensorflow/core/kernels/constant_op.h" #include "tensorflow/core/kernels/control_flow_ops.h" +#include "tensorflow/core/kernels/data/generator_dataset_op.h" +#include "tensorflow/core/kernels/data/iterator_ops.h" +#include "tensorflow/core/kernels/data/prefetch_dataset_op.h" #include "tensorflow/core/kernels/fifo_queue.h" +#include "tensorflow/core/kernels/function_ops.h" #include "tensorflow/core/kernels/identity_n_op.h" #include "tensorflow/core/kernels/identity_op.h" #include "tensorflow/core/kernels/no_op.h" @@ -166,7 +170,69 @@ class XlaAssignVariableOp : public AsyncOpKernel { QueueIsClosedOp); \ \ REGISTER_KERNEL_BUILDER( \ - Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp); + Name("FIFOQueueV2").Device(DEVICE).HostMemory("handle"), FIFOQueueOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name(kArgOp).Device(DEVICE).HostMemory("output").TypeConstraint("T", \ + TYPES), \ + ArgOp); \ + REGISTER_KERNEL_BUILDER(Name(kArgOp) \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ArgOp); \ + \ + REGISTER_KERNEL_BUILDER(Name(kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T", TYPES) \ + .HostMemory("input"), \ + RetvalOp); \ + REGISTER_KERNEL_BUILDER(Name(kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T") \ + .HostMemory("input"), \ + RetvalOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("RemoteCall").Device(DEVICE).HostMemory("target"), RemoteCallOp); \ + \ + REGISTER_KERNEL_BUILDER( \ + Name("GeneratorDataset").Device(DEVICE).HostMemory("handle"), \ + GeneratorDatasetOp); \ + REGISTER_KERNEL_BUILDER(Name("PrefetchDataset") \ + .Device(DEVICE) \ + .HostMemory("buffer_size") \ + .HostMemory("input_dataset") \ + .HostMemory("handle"), \ + PrefetchDatasetOp); \ + \ + REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE), \ + IteratorHandleOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("MakeIterator").Device(DEVICE).HostMemory("dataset"), \ + MakeIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE), \ + AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE), \ + IteratorGetNextOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle") \ + .Device(DEVICE) \ + .HostMemory("string_handle"), \ + IteratorToStringHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2") \ + .Device(DEVICE) \ + .HostMemory("string_handle"), \ + IteratorFromStringHandleOp); \ + REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp) \ + .Device(DEVICE) \ + .HostMemory("output") \ + .TypeConstraint("T"), \ + ArgOp); \ + REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kRetOp) \ + .Device(DEVICE) \ + .TypeConstraint("T") \ + .HostMemory("input"), \ + RetvalOp); // TODO(phawkins): currently we do not register the QueueEnqueueMany, // QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 851b118b0c18cfd752302b8f8dec27dae3e12acd..ef4466f0056ea98adc1ae6774105466af0d14293 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -59,7 +59,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options, } // TODO(b/78468222): Uncomment after fixing this bug - // status = device->CreateAndSetGpuDeviceInfo(); + // status = device->UseGpuDeviceInfo(); // if (!status.ok()) { // errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, // " device"); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 6134b8c6946429918a5ca37188cbff13a6cd1c79..4efbb2d5d7cf09d9cf1e35c8cf5403e7e0dfe733 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_launch_util.h" +#include + #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -182,7 +184,7 @@ void XlaComputationLaunchContext::PopulateInputs( } } -void XlaComputationLaunchContext::PopulateOutputs( +Status XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, ScopedShapedBuffer output) { se::Stream* stream = @@ -211,6 +213,15 @@ void XlaComputationLaunchContext::PopulateOutputs( output = ScopedShapedBuffer(std::move(buffer), output.memory_allocator()); } + std::shared_ptr definition_event; + if (use_multiple_streams_) { + definition_event = std::make_shared(stream->parent()); + if (!definition_event->Init()) { + return errors::Internal("Failed to initialize tensor definition event."); + } + stream->ThenRecordEvent(definition_event.get()); + } + // Copy XLA results to the OpOutputList. int output_num = 0; for (int i = 0; i < ctx->num_outputs(); ++i) { @@ -228,12 +239,13 @@ void XlaComputationLaunchContext::PopulateOutputs( // reallocate the device buffer later. VLOG(1) << "Constant output tensor on device"; - OP_REQUIRES_OK( - ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); + TF_RETURN_IF_ERROR( + ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); Device* device = dynamic_cast(ctx->device()); - OP_REQUIRES(ctx, device != nullptr, - errors::Internal("DeviceBase was not a Device.")); + if (device == nullptr) { + return errors::Internal("DeviceBase was not a Device."); + } ctx->op_device_context()->CopyCPUTensorToDevice( &const_tensor, device, output_tensor, [&](Status status) { TF_CHECK_OK(status); }); @@ -263,16 +275,13 @@ void XlaComputationLaunchContext::PopulateOutputs( se::DeviceMemoryBase buffer = output.buffer({output_num}); if (allocate_xla_tensors_) { Tensor* output_tensor; - OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor)); + TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); if (xla_tensor) { xla_tensor->set_shaped_buffer(ScopedShapedBuffer( ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); if (use_multiple_streams_) { - se::Event event(stream->parent()); - CHECK(event.Init()); - stream->ThenRecordEvent(&event); - xla_tensor->SetDefinedOn(stream, std::move(event)); + xla_tensor->SetDefinedOn(stream, definition_event); } } else { // xla_tensor wasn't valid, which must mean this is a zero-element @@ -298,41 +307,39 @@ void XlaComputationLaunchContext::PopulateOutputs( for (int i = 0; i < kernel->resource_updates.size(); ++i) { Allocator* allocator = ctx->device()->GetAllocator({}); const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i]; - OP_REQUIRES(ctx, - write.input_index >= 0 && write.input_index < ctx->num_inputs(), - errors::Internal("Invalid input index for variable write.")); + if (write.input_index < 0 || write.input_index >= ctx->num_inputs()) { + return errors::Internal("Invalid input index for variable write."); + } se::DeviceMemoryBase buffer = output.buffer({output_num}); Var* variable = nullptr; // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, // not a Tensor. - OP_REQUIRES_OK(ctx, LookupOrCreateResource( - ctx, HandleFromInput(ctx, write.input_index), - &variable, [this, ctx, &write](Var** ptr) { - *ptr = new Var(write.type); - return Status::OK(); - })); + TF_RETURN_IF_ERROR(LookupOrCreateResource( + ctx, HandleFromInput(ctx, write.input_index), &variable, + [&write](Var** ptr) { + *ptr = new Var(write.type); + return Status::OK(); + })); core::ScopedUnref s(variable); mutex_lock ml(*variable->mu()); - OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type, - errors::Internal("Mismatched type in variable write")); + if (variable->tensor()->dtype() != write.type) { + return errors::Internal("Mismatched type in variable write"); + } if (allocate_xla_tensors_) { Tensor output_tensor; - OP_REQUIRES_OK( - ctx, ctx->allocate_temp(write.type, write.shape, &output_tensor)); + TF_RETURN_IF_ERROR( + ctx->allocate_temp(write.type, write.shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); CHECK(xla_tensor); xla_tensor->set_shaped_buffer( ExtractSubShapedBuffer(&output, output_num, xla_allocator_)); if (use_multiple_streams_) { - se::Event event(stream->parent()); - CHECK(event.Init()); - stream->ThenRecordEvent(&event); - xla_tensor->SetDefinedOn(stream, std::move(event)); + xla_tensor->SetDefinedOn(stream, definition_event); } *variable->tensor() = output_tensor; } else { @@ -343,6 +350,7 @@ void XlaComputationLaunchContext::PopulateOutputs( } ++output_num; } + return Status::OK(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 1ea3fa4cf29266e8c452385226e56bd0b82622d9..4232f514b3b48681bf510ee568f916f5f4ebe882 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -93,9 +93,9 @@ class XlaComputationLaunchContext { const std::map& variables); // Given the XLA output in `output`, populate all outputs of `ctx`. - void PopulateOutputs(OpKernelContext* ctx, - const XlaCompiler::CompilationResult* kernel, - xla::ScopedShapedBuffer output); + Status PopulateOutputs(OpKernelContext* ctx, + const XlaCompiler::CompilationResult* kernel, + xla::ScopedShapedBuffer output); // Return the argument list. Only valid after PopulateInputs() has been // called. diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index d777dfa5a34fb9615ddcf393ed53be1491cb70af..92ba7de1b7d32fcf693cd12a380d7a1e0d861d71 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -75,7 +75,7 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) { mutex_lock lock(mu_); - if (!definition_event_.has_value()) { + if (!definition_event_) { return nullptr; } @@ -87,10 +87,11 @@ se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) { return nullptr; } - return &*definition_event_; + return definition_event_.get(); } -void XlaTensor::SetDefinedOn(se::Stream* stream, se::Event event) { +void XlaTensor::SetDefinedOn(se::Stream* stream, + std::shared_ptr event) { mutex_lock lock(mu_); definition_event_ = std::move(event); streams_defined_on_ = {stream}; diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index f7e401c731163200c518074f2caa6907efb1f684..8d36d0fa0a8230bcd1b16cc67de104e09358144f 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ #define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_ +#include + #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/core/framework/allocator.h" @@ -94,7 +96,7 @@ class XlaTensor { // Assert that the tensor's content is defined on 'stream' by the time 'event' // triggers. - void SetDefinedOn(se::Stream* stream, se::Event event); + void SetDefinedOn(se::Stream* stream, std::shared_ptr event); // Assert that the tensor's content is defined on 'stream'. This version does // not provide an event, and must be called *after* SetDefinedOn(Stream, @@ -116,7 +118,7 @@ class XlaTensor { // An optional event that is triggered when the tensor's content has been // defined. If this event is nullptr, it is assumed that the tensor's content // is always defined. - gtl::optional definition_event_; + std::shared_ptr definition_event_; // A list of all streams for which the tensor's content is defined for any // newly enqueued command. gtl::InlinedVector streams_defined_on_ GUARDED_BY(mu_); diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 080bed50e68ba353a5029f5eb959003b51327f4a..ae98b3f0f9d5dac66b9716ad84a9f0371511e9b6 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -673,6 +673,7 @@ tf_xla_py_test( "cpu", "cpu_ondemand", ], + shard_count = 5, tags = ["optonly"], deps = [ ":xla_test", @@ -690,11 +691,7 @@ tf_xla_py_test( size = "small", srcs = ["random_ops_test.py"], disabled_backends = [ - # TODO(b/110300529): RngNormal doesn't return values with the expected variance - "cpu", "cpu_ondemand", - # TODO(b/31361304): enable RNG ops on GPU when parallelized. - "gpu", ], deps = [ ":xla_test", @@ -1002,6 +999,7 @@ tf_xla_py_test( name = "sort_ops_test", size = "medium", srcs = ["sort_ops_test.py"], + shard_count = 5, # Times out in fastbuild mode. tags = ["optonly"], deps = [ diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py index 03554d6933aca39b428c6af4be0c78e2c7ccb0c9..0d2e4d029636577adc74784d9a8b3494b94dc67d 100644 --- a/tensorflow/compiler/tests/adam_test.py +++ b/tensorflow/compiler/tests/adam_test.py @@ -52,6 +52,9 @@ class AdamOptimizerTest(xla_test.XLATestCase): def testBasic(self): for dtype in self.float_types: + # TODO: test fails for float16 due to excessive precision requirements. + if dtype == np.float16: + continue with self.test_session(), self.test_scope(): variable_scope.get_variable_scope().set_use_resource(True) @@ -91,6 +94,9 @@ class AdamOptimizerTest(xla_test.XLATestCase): def testTensorLearningRate(self): for dtype in self.float_types: + # TODO: test fails for float16 due to excessive precision requirements. + if dtype == np.float16: + continue with self.test_session(), self.test_scope(): variable_scope.get_variable_scope().set_use_resource(True) @@ -130,6 +136,9 @@ class AdamOptimizerTest(xla_test.XLATestCase): def testSharing(self): for dtype in self.float_types: + # TODO: test fails for float16 due to excessive precision requirements. + if dtype == np.float16: + continue with self.test_session(), self.test_scope(): variable_scope.get_variable_scope().set_use_resource(True) diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index 6ead15da13b86b9d2b4cf2c19e5cf2a90b061b91..ff097f80f1f2586bd483a54d532750c90b2a8b03 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -32,6 +32,7 @@ from tensorflow.python.layers import convolutional from tensorflow.python.layers import pooling from tensorflow.python.ops import array_ops from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops @@ -122,6 +123,14 @@ class EagerTest(xla_test.XLATestCase): with self.test_scope(): self.assertAllEqual(2, array_ops.identity(2)) + def testRandomOps(self): + with self.test_scope(): + tensor = gen_random_ops.random_uniform((2, 2), dtypes.float32) + row0 = tensor[0].numpy() + row1 = tensor[1].numpy() + # It should be very unlikely to rng to generate two equal rows. + self.assertFalse((row0 == row1).all()) + def testIdentityOnVariable(self): with self.test_scope(): v = resource_variable_ops.ResourceVariable(True) @@ -400,6 +409,21 @@ class EagerFunctionTest(xla_test.XLATestCase): self.assertEqual(75, y.numpy()) self.assertEqual(30, dy.numpy()) + def testGradientTapeInDefun(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun + def f(): + x = constant_op.constant(1.0) + with backprop.GradientTape() as tape: + y = v0 * x + dy = tape.gradient(y, v0) + return dy + + dy = f() + self.assertEqual(1.0, dy.numpy()) + def testSliceInDefun(self): with self.test_scope(): diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py index 14c5e7a975e478ca6ceed37c28339b40612801c8..8c4e16e4e075726d741f6ff8cdfb6b1aad6cd33e 100644 --- a/tensorflow/compiler/tests/random_ops_test.py +++ b/tensorflow/compiler/tests/random_ops_test.py @@ -57,7 +57,8 @@ class RandomOpsTest(xla_test.XLATestCase): def testRandomUniformIsNotConstant(self): def rng(dtype): - return random_ops.random_uniform(shape=[2], dtype=dtype, maxval=1000000) + dtype = dtypes.as_dtype(dtype) + return random_ops.random_uniform(shape=[2], dtype=dtype, maxval=dtype.max) for dtype in self._random_types(): self._testRngIsNotConstant(rng, dtype) @@ -73,6 +74,11 @@ class RandomOpsTest(xla_test.XLATestCase): def testRandomUniformIsInRange(self): for dtype in self._random_types(): + # TODO (b/112272078): enable bfloat16 for CPU and GPU when the bug is + # fixed. + if (self.device in ["XLA_GPU", "XLA_CPU" + ]) and (dtype in [dtypes.bfloat16, dtypes.half]): + continue with self.test_session() as sess: with self.test_scope(): x = random_ops.random_uniform( @@ -95,7 +101,7 @@ class RandomOpsTest(xla_test.XLATestCase): for dtype in [dtypes.float32]: with self.test_session() as sess: with self.test_scope(): - x = random_ops.truncated_normal(shape=[count], dtype=dtype, seed=42) + x = random_ops.truncated_normal(shape=[count], dtype=dtype) y = sess.run(x) def normal_cdf(x): @@ -124,20 +130,23 @@ class RandomOpsTest(xla_test.XLATestCase): # Department of Scientific Computing website. Florida State University. expected_mean = mu + (normal_pdf(alpha) - normal_pdf(beta)) / z * sigma actual_mean = np.mean(y) - self.assertAllClose(actual_mean, expected_mean, atol=2e-4) + self.assertAllClose(actual_mean, expected_mean, atol=2e-3) expected_median = mu + probit( (normal_cdf(alpha) + normal_cdf(beta)) / 2.) * sigma actual_median = np.median(y) - self.assertAllClose(actual_median, expected_median, atol=8e-4) + self.assertAllClose(actual_median, expected_median, atol=1e-2) expected_variance = sigma**2 * (1 + ( (alpha * normal_pdf(alpha) - beta * normal_pdf(beta)) / z) - ( (normal_pdf(alpha) - normal_pdf(beta)) / z)**2) actual_variance = np.var(y) - self.assertAllClose(actual_variance, expected_variance, rtol=3e-4) + self.assertAllClose(actual_variance, expected_variance, rtol=2*1e-3) def testShuffle1d(self): + # TODO(b/26783907): this test requires the CPU backend to implement sort. + if self.device in ["XLA_CPU"]: + return with self.test_session() as sess: with self.test_scope(): x = math_ops.range(1 << 16) diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index 16f293891d56d78885dd515bb7b9899faf0690f7..c0ea242044540b1cef44186880ba3cd92b8849d6 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -62,6 +62,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" @@ -101,6 +102,9 @@ class OpTestBuilder { OpTestBuilder& RandomInput(DataType type); OpTestBuilder& RandomInput(DataType type, std::vector dims); + // As RandomInput but the values are unique. + OpTestBuilder& RandomUniqueInput(DataType type, std::vector dims); + // Sets an attribute. template OpTestBuilder& Attr(StringPiece attr_name, T&& value); @@ -126,6 +130,7 @@ class OpTestBuilder { DataType type = DT_INVALID; bool has_dims = false; + bool needs_unique_values = false; std::vector dims; }; @@ -167,6 +172,18 @@ OpTestBuilder& OpTestBuilder::RandomInput(DataType type, return *this; } +OpTestBuilder& OpTestBuilder::RandomUniqueInput(DataType type, + std::vector dims) { + VLOG(1) << "Adding input: " << type << " " << TensorShape(dims).DebugString(); + InputDescription input; + input.type = type; + input.has_dims = true; + input.needs_unique_values = true; + input.dims = std::move(dims); + inputs_.push_back(input); + return *this; +} + template OpTestBuilder& OpTestBuilder::Attr(StringPiece attr_name, T&& value) { AddNodeAttr(attr_name, std::forward(value), &node_def_); @@ -289,7 +306,8 @@ class OpTest : public ::testing::Test { // Returns a tensor filled with random but "reasonable" values from the middle // of the type's range. If the shape is omitted, a random shape is used. // TODO(phawkins): generalize this code to a caller-supplied distribution. - Tensor RandomTensor(DataType dtype, gtl::ArraySlice shape); + Tensor RandomTensor(DataType dtype, bool needs_unique_values, + gtl::ArraySlice shape); Tensor RandomTensor(DataType dtype); // Like RandomTensor, but uses values >= 0. @@ -432,49 +450,90 @@ std::vector OpTest::RandomDims(int min_rank, int max_rank, return dims; } -Tensor OpTest::RandomTensor(DataType dtype, gtl::ArraySlice shape) { +Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values, + gtl::ArraySlice shape) { Tensor tensor(dtype, TensorShape(shape)); switch (dtype) { case DT_FLOAT: { + gtl::FlatSet already_generated; std::uniform_real_distribution distribution(-1.0f, 1.0f); - test::FillFn(&tensor, [this, &distribution](int i) -> float { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> float { + float generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_DOUBLE: { + gtl::FlatSet already_generated; std::uniform_real_distribution distribution(-1.0, 1.0); - test::FillFn(&tensor, [this, &distribution](int i) -> double { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> double { + double generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_COMPLEX64: { + gtl::FlatSet> already_generated; std::uniform_real_distribution distribution(-1.0f, 1.0f); - test::FillFn(&tensor, [this, &distribution](int i) { - return complex64(distribution(generator()), distribution(generator())); + test::FillFn(&tensor, [&](int i) { + complex64 generated; + do { + generated = + complex64(distribution(generator()), distribution(generator())); + } while ( + needs_unique_values && + !already_generated + .insert(std::make_pair(generated.real(), generated.imag())) + .second); + return generated; }); break; } case DT_INT32: { + gtl::FlatSet already_generated; std::uniform_int_distribution distribution(-(1 << 20), 1 << 20); - test::FillFn(&tensor, [this, &distribution](int i) -> int32 { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> int32 { + int32 generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_INT64: { + gtl::FlatSet already_generated; std::uniform_int_distribution distribution(-(1LL << 40), 1LL << 40); - test::FillFn(&tensor, [this, &distribution](int i) -> int64 { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> int64 { + int64 generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } case DT_BOOL: { + gtl::FlatSet already_generated; std::bernoulli_distribution distribution; - test::FillFn(&tensor, [this, &distribution](int i) -> bool { - return distribution(generator()); + test::FillFn(&tensor, [&](int i) -> bool { + bool generated; + do { + generated = distribution(generator()); + } while (needs_unique_values && + !already_generated.insert(generated).second); + return generated; }); break; } @@ -485,7 +544,7 @@ Tensor OpTest::RandomTensor(DataType dtype, gtl::ArraySlice shape) { } Tensor OpTest::RandomTensor(DataType dtype) { - return RandomTensor(dtype, RandomDims()); + return RandomTensor(dtype, /*needs_unique_values=*/false, RandomDims()); } Tensor OpTest::RandomNonNegativeTensor(DataType dtype, @@ -761,7 +820,8 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose( VLOG(1) << "Ignoring oversize dims."; return kInvalid; } - input_tensors.push_back(RandomTensor(input.type, dims)); + input_tensors.push_back( + RandomTensor(input.type, input.needs_unique_values, dims)); } VLOG(1) << "Input: " << input_tensors.back().DebugString(); } @@ -960,7 +1020,7 @@ TEST_F(OpTest, ArgMax) { std::uniform_int_distribution(-num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMax") - .RandomInput(DT_FLOAT, dims) + .RandomUniqueInput(DT_FLOAT, dims) .Input(test::AsScalar(reduce_dim)) .Attr("T", DT_FLOAT) .Attr("Tidx", DT_INT32) @@ -976,7 +1036,7 @@ TEST_F(OpTest, ArgMin) { std::uniform_int_distribution(-num_dims, num_dims)(generator()); return ExpectTfAndXlaOutputsAreClose( OpTestBuilder("ArgMin") - .RandomInput(DT_FLOAT, dims) + .RandomUniqueInput(DT_FLOAT, dims) .Input(test::AsScalar(reduce_dim)) .Attr("T", DT_FLOAT) .Attr("Tidx", DT_INT32) diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 5f25ff9002964e94db384d7b01f07cfc4f8938b1..73adb0d243b3b27e6c6ba669b2fd134a5976a2ec 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -361,6 +361,12 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([[-0.05, 6.05, 5]], dtype=dtype), expected=np.array([[0, 6, 5]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + nn_ops.softmax, + np.array([1, 2, 3, 4], dtype=dtype), + expected=np.array([0.032058604, 0.087144323, 0.23688284, 0.64391428], + dtype=dtype)) + self._assertOpOutputMatchesExpected( nn_ops.softmax, np.array([[1, 1, 1, 1], [1, 2, 3, 4]], dtype=dtype), @@ -369,6 +375,14 @@ class UnaryOpsTest(xla_test.XLATestCase): [0.032058604, 0.087144323, 0.23688284, 0.64391428]], dtype=dtype)) + self._assertOpOutputMatchesExpected( + nn_ops.softmax, + np.array([[[1, 1], [1, 1]], [[1, 2], [3, 4]]], dtype=dtype), + expected=np.array( + [[[0.5, 0.5], [0.5, 0.5]], + [[0.26894142, 0.73105858], [0.26894142, 0.73105858]]], + dtype=dtype)) + self._assertOpOutputMatchesExpected( nn_ops.softsign, np.array([[-2, -1, 0, 1, 2]], dtype=dtype), diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py index 06d977b93c28792704b910c688af510bc650d2a4..85084bb1240cf05f6eabfbea772df113cabe613c 100644 --- a/tensorflow/compiler/tests/xla_device_test.py +++ b/tensorflow/compiler/tests/xla_device_test.py @@ -21,6 +21,8 @@ from __future__ import print_function import numpy as np from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_control_flow_ops @@ -47,6 +49,34 @@ class XlaDeviceTest(xla_test.XLATestCase): result = sess.run(z, {x: inputs}) self.assertAllCloseAccordingToType(result, inputs + inputs) + def testCopiesOfUnsupportedTypesFailGracefully(self): + """Tests that copies of unsupported types don't crash.""" + test_types = set([ + np.uint8, np.uint16, np.uint32, np.uint64, np.int8, np.int16, np.int32, + np.int64, np.float16, np.float32, np.float16, + dtypes.bfloat16.as_numpy_dtype + ]) + shape = (10, 10) + for unsupported_dtype in test_types - self.all_types: + with self.test_session() as sess: + with ops.device("CPU"): + x = array_ops.placeholder(unsupported_dtype, shape) + with self.test_scope(): + y, = array_ops.identity_n([x]) + with ops.device("CPU"): + z = array_ops.identity(y) + + inputs = np.random.randint(-100, 100, shape) + inputs = inputs.astype(unsupported_dtype) + # Execution should either succeed or raise an InvalidArgumentError, + # but not crash. Even "unsupported types" may succeed here since some + # backends (e.g., the CPU backend) are happy to handle buffers of + # unsupported types, even if they cannot compute with them. + try: + sess.run(z, {x: inputs}) + except errors.InvalidArgumentError: + pass + def testControlTrigger(self): with self.test_session() as sess: with self.test_scope(): diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 338943201bb11a66370d82f301736a0d8d0fc7ed..fda32c8a1c9491e0dadceec0d7265e1002d41528 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -91,6 +91,22 @@ cc_library( ], ) +cc_library( + name = "cpu_function_runtime", + srcs = ["cpu_function_runtime.cc"], + hdrs = ["cpu_function_runtime.h"], + visibility = [ + "//tensorflow/compiler/aot:__pkg__", + "//tensorflow/compiler/xla/service/cpu:__pkg__", + ], + deps = [ + # Keep dependencies to a minimum here; this library is used in every AOT + # binary produced by tfcompile. + "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/core:framework_lite", + ], +) + cc_library( name = "xla_compiled_cpu_function", srcs = ["xla_compiled_cpu_function.cc"], @@ -99,12 +115,23 @@ cc_library( deps = [ # Keep dependencies to a minimum here; this library is used in every AOT # binary produced by tfcompile. - "//tensorflow/compiler/aot:runtime", + ":cpu_function_runtime", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/core:framework_lite", ], ) +tf_cc_test( + name = "cpu_function_runtime_test", + srcs = ["cpu_function_runtime_test.cc"], + deps = [ + ":cpu_function_runtime", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "xla_jit_compiled_cpu_function", srcs = ["xla_jit_compiled_cpu_function.cc"], @@ -121,6 +148,7 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service/cpu:buffer_info_util", "//tensorflow/compiler/xla/service/cpu:cpu_executable", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/aot/runtime.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime.cc similarity index 70% rename from tensorflow/compiler/aot/runtime.cc rename to tensorflow/compiler/tf2xla/cpu_function_runtime.cc index 5e74079fc158379b8977ada6412141e39142c3d3..fcc4095e39673b786544984a41988c3e9c5b0efb 100644 --- a/tensorflow/compiler/aot/runtime.cc +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,22 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/aot/runtime.h" - -#include +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/core/platform/dynamic_annotations.h" namespace tensorflow { -namespace tfcompile { -namespace runtime { - namespace { - // Inline memory allocation routines here, because depending on '//base' brings // in libraries which use c++ streams, which adds considerable code size on // android. -inline void* aligned_malloc(size_t size, int minimum_alignment) { +void* aligned_malloc(size_t size, int minimum_alignment) { #if defined(__ANDROID__) || defined(OS_ANDROID) || defined(OS_CYGWIN) return memalign(minimum_alignment, size); #elif defined(_WIN32) @@ -47,7 +41,7 @@ inline void* aligned_malloc(size_t size, int minimum_alignment) { #endif } -inline void aligned_free(void* aligned_memory) { +void aligned_free(void* aligned_memory) { #if defined(_WIN32) _aligned_free(aligned_memory); #else @@ -58,22 +52,29 @@ inline void aligned_free(void* aligned_memory) { size_t align_to(size_t n, size_t align) { return (((n - 1) / align) + 1) * align; } - } // namespace -size_t aligned_buffer_bytes(const intptr_t* sizes, size_t n) { +namespace cpu_function_runtime { +size_t AlignedBufferBytes(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params) { size_t total = 0; for (size_t i = 0; i < n; ++i) { - if (sizes[i] != -1) { - total += align_to(sizes[i], kAlign); + bool should_allocate = + buffer_infos[i].is_temp_buffer() || + (buffer_infos[i].is_entry_parameter() && allocate_entry_params); + + if (should_allocate) { + total += align_to(buffer_infos[i].size(), kAlign); } } return total; } -void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs, +void* MallocContiguousBuffers(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params, void** bufs, bool annotate_initialized) { - const size_t total = aligned_buffer_bytes(sizes, n); + const size_t total = + AlignedBufferBytes(buffer_infos, n, allocate_entry_params); void* contiguous = nullptr; if (total > 0) { contiguous = aligned_malloc(total, kAlign); @@ -85,11 +86,14 @@ void* MallocContiguousBuffers(const intptr_t* sizes, size_t n, void** bufs, } uintptr_t pos = reinterpret_cast(contiguous); for (size_t i = 0; i < n; ++i) { - if (sizes[i] == -1) { - bufs[i] = nullptr; - } else { + bool should_allocate = + buffer_infos[i].is_temp_buffer() || + (buffer_infos[i].is_entry_parameter() && allocate_entry_params); + if (should_allocate) { bufs[i] = reinterpret_cast(pos); - pos += align_to(sizes[i], kAlign); + pos += align_to(buffer_infos[i].size(), kAlign); + } else { + bufs[i] = nullptr; } } return contiguous; @@ -100,7 +104,5 @@ void FreeContiguous(void* contiguous) { aligned_free(contiguous); } } - -} // namespace runtime -} // namespace tfcompile +} // namespace cpu_function_runtime } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.h b/tensorflow/compiler/tf2xla/cpu_function_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..dfc1e8b8aebcf3142e9f61f60171c6b58634c71d --- /dev/null +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.h @@ -0,0 +1,165 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2XLA_CPU_FUNCTION_RUNTIME_H_ +#define TENSORFLOW_COMPILER_TF2XLA_CPU_FUNCTION_RUNTIME_H_ + +#include "tensorflow/core/platform/types.h" + +#include + +namespace tensorflow { +namespace cpu_function_runtime { +// Stores information about one buffer used by an XLA:CPU compiled function. +// These buffers are used for holding inputs to the computation, outputs from +// the computation and as temporary scratch space. +class BufferInfo { + public: + // Creates a BufferInfo from a serialized encoding generated by `Encode`. + explicit BufferInfo(std::pair encoding) + : entry_param_number_(encoding.second) { + Kind kind; + uint64 size; + Unpack(encoding.first, &kind, &size); + kind_ = kind; + size_ = size; + } + + // Returns true if this buffer stores a constant. These never need to be + // allocated by the runtime. + bool is_constant() const { return kind() == Kind::kConstant; } + + // Returns true if this buffer stores an entry parameter. These may or may + // not need to be allocated by the runtime, depending on + // XlaCompiledCpuFunction::AllocMode. + bool is_entry_parameter() const { return kind() == Kind::kEntryParameter; } + + // Returns the entry parameter number of this buffer. + uint64 entry_parameter_number() const { + assert(is_entry_parameter()); + return entry_param_number_; + } + + // Returns true if this buffer is temporary scratch space required by the XLA + // computations. These are always allocated by the runtime. + bool is_temp_buffer() const { return kind() == Kind::kTempBuffer; } + + // Returns true if this buffer is allocated on the C stack or into registers. + // These buffers are never allocated by the runtime. + bool is_on_stack_buffer() const { return kind() == Kind::kOnStackBuffer; } + + // Returns the size for this buffer. + uint64 size() const { return size_; } + + // Encodes this BufferInfo into two 64 bit integers that can be used to + // reconstruct the BufferInfo later using the constructor. We need this + // because we use BufferInfo in places where using protocol buffers would + // negatively impact binary size. + std::pair Encode() const { + static_assert(sizeof(*this) == 16, ""); + uint64 upper = Pack(kind(), size_); + uint64 lower = entry_param_number_; + return {upper, lower}; + } + + bool operator==(const BufferInfo& buffer_info) const { + if (kind() != buffer_info.kind() || size() != buffer_info.size()) { + return false; + } + return !is_entry_parameter() || + entry_parameter_number() == buffer_info.entry_parameter_number(); + } + + // Factory methods: + + static BufferInfo MakeTempBuffer(uint64 size) { + return BufferInfo(Kind::kTempBuffer, /*size=*/size, + /*entry_param_number=*/-1); + } + static BufferInfo MakeConstant(uint64 size) { + return BufferInfo(Kind::kConstant, /*size=*/size, + /*entry_param_number=*/-1); + } + static BufferInfo MakeEntryParameter(uint64 size, uint64 param_number) { + return BufferInfo(Kind::kEntryParameter, /*size=*/size, + /*entry_param_number=*/param_number); + } + static BufferInfo MakeOnStackBuffer(uint64 size) { + return BufferInfo(Kind::kOnStackBuffer, /*size=*/size, + /*entry_param_number=*/-1); + } + + private: + BufferInfo() = default; + + enum class Kind : unsigned { + kConstant, + kTempBuffer, + kEntryParameter, + kOnStackBuffer + }; + + Kind kind() const { return static_cast(kind_); } + + explicit BufferInfo(Kind kind, uint64 size, uint64 entry_param_number) + : kind_(kind), size_(size), entry_param_number_(entry_param_number) {} + + static uint64 Pack(Kind kind, uint64 size) { + return (static_cast(size) << 2) | static_cast(kind); + } + + static void Unpack(uint64 packed, Kind* kind, uint64* size) { + *size = packed >> 2; + *kind = static_cast((packed << 62) >> 62); + } + + Kind kind_ : 2; + uint64 size_ : 62; + int64 entry_param_number_; +}; + +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +constexpr size_t kAlign = 64; + +// AlignedBufferBytes returns the sum of the size of each buffer in +// `buffer_infos`, skipping constants, on-stack buffers and, if +// allocate_entry_params is false, entry parameters. There are `n` entries in +// `buffer_infos`. Each buffer is aligned to kAlign byte boundaries. +size_t AlignedBufferBytes(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params); + +// MallocContiguousBuffers allocates buffers for use by the entry point +// generated by tfcompile. There are `n` entries in `buffer_infos`. If +// `annotate_initialized` is set, the allocated memory will be annotated as +// having been initialized - this is useful when allocating temporary buffers. +// If allocate_entry_params is true then allocates temp buffers and entry +// parameters, otherwise allocated only temp buffers. Slots in `bufs` +// corresponding to unallocated buffers are set to nullptr. +// +// A single contiguous block of memory is allocated, and portions of it are +// parceled out into `bufs`, which must have space for `n` entries. Returns +// the head of the allocated contiguous block, which should be passed to +// FreeContiguous when the buffers are no longer in use. +void* MallocContiguousBuffers(const BufferInfo* buffer_infos, size_t n, + bool allocate_entry_params, void** bufs, + bool annotate_initialized); + +// FreeContiguous frees the contiguous block of memory allocated by +// MallocContiguousBuffers. +void FreeContiguous(void* contiguous); +} // namespace cpu_function_runtime +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_TF2XLA_CPU_FUNCTION_RUNTIME_H_ diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc similarity index 50% rename from tensorflow/compiler/aot/runtime_test.cc rename to tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc index 06ec623eb2dce5f8dc7156fb7e7b9ad57d90c8ee..8ca628c4eb6700d7184899bc1753dd6c6aa392b0 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/tf2xla/cpu_function_runtime_test.cc @@ -13,39 +13,70 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/aot/runtime.h" +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { -namespace tfcompile { -namespace runtime { namespace { -TEST(Runtime, AlignmentValue) { +using cpu_function_runtime::BufferInfo; + +TEST(XlaCompiledCpuFunctionTest, AlignmentValue) { // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8 // So any value that we choose must abide by that constraint as well. - EXPECT_EQ(kAlign, Allocator::kAllocatorAlignment); + EXPECT_EQ(cpu_function_runtime::kAlign, Allocator::kAllocatorAlignment); +} + +std::vector SizesToBufferInfos(const intptr_t* sizes, size_t n) { + std::vector buffer_infos; + std::transform(sizes, sizes + n, std::back_inserter(buffer_infos), + [&](intptr_t size) { + if (size == -1) { + // Use a dummy on-stack buffer allocation to indicat the + // the current slot does not need an allocation. + int64 on_stack_buffer_size = 4; + return BufferInfo::MakeOnStackBuffer(on_stack_buffer_size); + } + return BufferInfo::MakeTempBuffer(size); + }); + return buffer_infos; +} + +// Simple wrappers to make writing tests more ergonomic. + +size_t AlignedBufferBytesFromSizes(const intptr_t* sizes, size_t n) { + std::vector buffer_infos = SizesToBufferInfos(sizes, n); + return AlignedBufferBytes(buffer_infos.data(), n, + /*allocate_entry_params=*/false); } -TEST(Runtime, AlignedBufferBytes) { - EXPECT_EQ(aligned_buffer_bytes(nullptr, 0), 0); +void* MallocContiguousBuffersFromSizes(const intptr_t* sizes, size_t n, + void** bufs, bool annotate_initialized) { + std::vector buffer_infos = SizesToBufferInfos(sizes, n); + return MallocContiguousBuffers(buffer_infos.data(), n, + /*allocate_entry_params=*/false, bufs, + annotate_initialized); +} + +TEST(XlaCompiledCpuFunctionTest, AlignedBufferBytes) { + EXPECT_EQ(AlignedBufferBytesFromSizes(nullptr, 0), 0); static constexpr intptr_t sizesA[1] = {-1}; - EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); + EXPECT_EQ(AlignedBufferBytesFromSizes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -56,48 +87,48 @@ void* add_ptr(void* base, uintptr_t delta) { // expected nullptrs, and write to each byte of allocated memory. We rely on // the leak checker to tell us if there's an inconsistency between malloc and // free. We also check the contiguous property. -TEST(Runtime, MallocFreeContiguousBuffers) { +TEST(XlaCompiledCpuFunctionTest, MallocFreeContiguousBuffers) { // Test empty sizes. - void* base = MallocContiguousBuffers(nullptr, 0, nullptr, false); + void* base = MallocContiguousBuffersFromSizes(nullptr, 0, nullptr, false); EXPECT_EQ(base, nullptr); - FreeContiguous(base); + cpu_function_runtime::FreeContiguous(base); // Test non-empty sizes with 0 sum. static constexpr intptr_t sizesA[1] = {-1}; void* bufA[1]; - base = MallocContiguousBuffers(sizesA, 1, bufA, false); + base = MallocContiguousBuffersFromSizes(sizesA, 1, bufA, false); EXPECT_EQ(base, nullptr); EXPECT_EQ(bufA[0], nullptr); - FreeContiguous(base); + cpu_function_runtime::FreeContiguous(base); // Test non-empty sizes with non-0 sum. static constexpr intptr_t sizesB[1] = {3}; void* bufB[1]; - base = MallocContiguousBuffers(sizesB, 1, bufB, false); + base = MallocContiguousBuffersFromSizes(sizesB, 1, bufB, false); EXPECT_NE(base, nullptr); EXPECT_EQ(bufB[0], add_ptr(base, 0)); char* bufB0_bytes = static_cast(bufB[0]); bufB0_bytes[0] = 'A'; bufB0_bytes[1] = 'B'; bufB0_bytes[2] = 'C'; - FreeContiguous(base); + cpu_function_runtime::FreeContiguous(base); // Test non-empty sizes with non-0 sum, and annotate_initialized. static constexpr intptr_t sizesC[1] = {3}; void* bufC[1]; - base = MallocContiguousBuffers(sizesC, 1, bufC, true); + base = MallocContiguousBuffersFromSizes(sizesC, 1, bufC, true); EXPECT_NE(base, nullptr); EXPECT_EQ(bufC[0], add_ptr(base, 0)); char* bufC0_bytes = static_cast(bufC[0]); bufC0_bytes[0] = 'A'; bufC0_bytes[1] = 'B'; bufC0_bytes[2] = 'C'; - FreeContiguous(base); + cpu_function_runtime::FreeContiguous(base); // Test mixed sizes. static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; void* bufD[7]; - base = MallocContiguousBuffers(sizesD, 7, bufD, false); + base = MallocContiguousBuffersFromSizes(sizesD, 7, bufD, false); EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); @@ -115,10 +146,26 @@ TEST(Runtime, MallocFreeContiguousBuffers) { } } } - FreeContiguous(base); + cpu_function_runtime::FreeContiguous(base); +} + +void CheckRoundTripIsOk(const BufferInfo& buffer_info) { + BufferInfo round_trip(buffer_info.Encode()); + ASSERT_EQ(round_trip, buffer_info); +} + +TEST(XlaCompiledCpuFunctionTest, BufferInfoTest) { + CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(0)); + CheckRoundTripIsOk(BufferInfo::MakeTempBuffer(4)); + CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(0)); + CheckRoundTripIsOk(BufferInfo::MakeOnStackBuffer(4)); + CheckRoundTripIsOk(BufferInfo::MakeConstant(0)); + CheckRoundTripIsOk(BufferInfo::MakeConstant(4)); + CheckRoundTripIsOk( + BufferInfo::MakeEntryParameter(/*size=*/0, /*param_number=*/4)); + CheckRoundTripIsOk( + BufferInfo::MakeEntryParameter(/*size=*/4, /*param_number=*/0)); } } // namespace -} // namespace runtime -} // namespace tfcompile } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 0609e223381550645d1a41ba75e4cd57f893ee95..b1366e9e31e28406c5bf1a808b9c5670558ed9c7 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -6,6 +6,10 @@ package( load("//tensorflow:tensorflow.bzl", "tf_copts") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") +load( + "//third_party/mkl:build_defs.bzl", + "if_mkl", +) tf_kernel_library( name = "xla_ops", @@ -129,6 +133,7 @@ tf_kernel_library( "//tensorflow/compiler/xla/client/lib:constants", "//tensorflow/compiler/xla/client/lib:math", "//tensorflow/compiler/xla/client/lib:numeric", + "//tensorflow/compiler/xla/client/lib:pooling", "//tensorflow/compiler/xla/client/lib:prng", "//tensorflow/compiler/xla/client/lib:sorting", "//tensorflow/core:framework", @@ -153,8 +158,14 @@ tf_kernel_library( "//tensorflow/core/kernels:sparse_to_dense_op", "//tensorflow/core/kernels:stack_ops", "//tensorflow/core/kernels:training_ops", - "//tensorflow/core/kernels:transpose_op", - ], + ] + if_mkl( + [ + "//tensorflow/core/kernels:mkl_transpose_op", + ], + [ + "//tensorflow/core/kernels:transpose_op", + ], + ), ) tf_kernel_library( diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc index 26fc1620a4f032b3af28de6e3a5af0e965e82341..276d744c096f8996c774964204feaa3762bdb844 100644 --- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc @@ -65,6 +65,6 @@ class XlaArgOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(XlaArgOp); }; -REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes(), XlaArgOp); +REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes().CompilationOnly(), XlaArgOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index ceb2af756c2d2020c7449086b957c9fbc1cc2979..6a7eb8d90c45ab119096eaa259e05c6ca768c5aa 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -200,25 +200,35 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { } } + bool resource_variable_seen = false; + for (int i = 0; i < ctx->num_inputs(); ++i) { + if (ctx->input_type(i) == DT_RESOURCE) { + resource_variable_seen = true; + } else { + OP_REQUIRES( + ctx, !resource_variable_seen, + errors::FailedPrecondition( + "Resource variables and regular inputs cannot be interleaved.")); + } + } + xla::XlaOp outputs = xla::Conditional( ctx->Input(0), xla::Tuple(b, inputs), *then_result.computation, xla::Tuple(b, inputs), *else_result.computation); // Sets non-variable outputs. for (int i = 0; i < output_types_.size(); ++i) { - if (ctx->input_type(i) != DT_RESOURCE) { - xla::XlaOp output_handle = xla::GetTupleElement(outputs, i); - if (VLOG_IS_ON(2)) { - LOG(INFO) << "Setting output " << i; - auto shape_or = b->GetShape(output_handle); - if (shape_or.ok()) { - LOG(INFO) << "Shape for output " << i << ": " - << xla::ShapeUtil::HumanString(shape_or.ValueOrDie()); - } else { - LOG(INFO) << "Shape unknown for output " << i; - } + xla::XlaOp output_handle = xla::GetTupleElement(outputs, i); + if (VLOG_IS_ON(2)) { + LOG(INFO) << "Setting output " << i; + auto shape_or = b->GetShape(output_handle); + if (shape_or.ok()) { + LOG(INFO) << "Shape for output " << i << ": " + << xla::ShapeUtil::HumanString(shape_or.ValueOrDie()); + } else { + LOG(INFO) << "Shape unknown for output " << i; } - ctx->SetOutput(i, output_handle); } + ctx->SetOutput(i, output_handle); } // Updates the values of any resource variables modified by the conditional @@ -247,6 +257,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { } REGISTER_XLA_OP(Name("If").AllowResourceTypes(), XlaIfOp); +REGISTER_XLA_OP(Name("StatelessIf").AllowResourceTypes(), XlaIfOp); REGISTER_XLA_OP(Name("XlaIf").AllowResourceTypes(), XlaIfOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 3d506e71e03d6b804d1ea0e63c760cfb82629f12..d4d180aff806f12875f0e43f111ee090f6607ef6 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/pooling.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/literal.h" @@ -71,59 +72,53 @@ class PoolingOp : public XlaOpKernel { int num_dims() const { return num_spatial_dims_ + 2; } - // Method that builds an initial value to use in reductions. - virtual xla::XlaOp InitValue(xla::XlaBuilder* b) = 0; - - // The reduction operation to apply to each window. - virtual const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) = 0; - - // A post-processing operation to apply on the outputs of the ReduceWindow. - virtual xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, - const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape) = 0; - - void Compile(XlaOpKernelContext* ctx) override { - std::vector ksize = ksize_; - std::vector stride = stride_; - if (ctx->num_inputs() != 1) { - const TensorShape ksize_shape = ctx->InputShape(1); - // Validate input sizes. - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ksize_shape), - errors::InvalidArgument("ksize must be a vector, not shape ", - ksize_shape.DebugString())); - OP_REQUIRES(ctx, ksize_shape.num_elements() == num_dims(), - errors::InvalidArgument("Sliding window ksize field must " - "specify ", - num_dims(), " dimensions")); - ksize.clear(); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &ksize)); - - const TensorShape stride_shape = ctx->InputShape(2); - // Validate input sizes. - OP_REQUIRES(ctx, TensorShapeUtils::IsVector(stride_shape), - errors::InvalidArgument("stride must be a vector, not shape ", - stride_shape.DebugString())); - OP_REQUIRES(ctx, stride_shape.num_elements() == num_dims(), - errors::InvalidArgument("Sliding window stride field must " - "specify ", - num_dims(), " dimensions")); - stride.clear(); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(2, &stride)); + protected: + xla::StatusOr> GetKernelSize(XlaOpKernelContext* ctx) { + if (ctx->num_inputs() == 1) { + return ksize_; } - const TensorShape input_shape = ctx->InputShape(0); - OP_REQUIRES(ctx, input_shape.dims() == num_dims(), - errors::InvalidArgument("Input to ", type_string(), - " operator must have ", num_dims(), - " dimensions")); + const TensorShape ksize_shape = ctx->InputShape(1); + // Validate input sizes. + if (!TensorShapeUtils::IsVector(ksize_shape)) { + return errors::InvalidArgument("ksize must be a vector, not shape ", + ksize_shape.DebugString()); + } + if (ksize_shape.num_elements() != num_dims()) { + return errors::InvalidArgument( + "Sliding window ksize field must " + "specify ", + num_dims(), " dimensions"); + } + std::vector ksize; + auto status = ctx->ConstantInputAsIntVector(1, &ksize); + if (!status.ok()) { + return status; + } + return ksize; + } - xla::XlaBuilder* const b = ctx->builder(); - auto input = - XlaHelpers::ConvertElementType(b, ctx->Input(0), reduction_type_); - auto reduce = xla::ReduceWindow(input, InitValue(b), *Reduction(ctx), ksize, - stride, padding_); - auto pooled = XlaHelpers::ConvertElementType(b, reduce, input_type(0)); - ctx->SetOutput(0, - PostProcessOutput(ctx, pooled, input_type(0), input_shape)); + xla::StatusOr> GetStride(XlaOpKernelContext* ctx) { + if (ctx->num_inputs() == 1) { + return stride_; + } + const TensorShape stride_shape = ctx->InputShape(2); + // Validate input sizes. + if (!TensorShapeUtils::IsVector(stride_shape)) { + return errors::InvalidArgument("stride must be a vector, not shape ", + stride_shape.DebugString()); + } + if (stride_shape.num_elements() != num_dims()) { + return errors::InvalidArgument( + "Sliding window stride field must " + "specify ", + num_dims(), " dimensions"); + } + std::vector stride; + auto status = ctx->ConstantInputAsIntVector(2, &stride); + if (!status.ok()) { + return status; + } + return stride; } protected: @@ -136,24 +131,48 @@ class PoolingOp : public XlaOpKernel { xla::PrimitiveType xla_reduction_type_; }; +// Converts the tensor data format to the one required by the XLA pooling +// library. +xla::TensorFormat XlaTensorFormat(tensorflow::TensorFormat data_format, + int num_spatial_dims) { + int num_dims = num_spatial_dims + 2; + int batch_dimension = GetTensorBatchDimIndex(num_dims, data_format); + int feature_dimension = GetTensorFeatureDimIndex(num_dims, data_format); + gtl::InlinedVector spatial_dimensions(num_spatial_dims); + for (int spatial_dim = 0; spatial_dim < num_spatial_dims; ++spatial_dim) { + spatial_dimensions[spatial_dim] = + GetTensorSpatialDimIndex(num_dims, data_format, spatial_dim); + } + return xla::TensorFormat(/*batch_dimension=*/batch_dimension, + /*feature_dimension=*/feature_dimension, + /*spatial_dimensions=*/spatial_dimensions); +} + class MaxPoolOp : public PoolingOp { public: MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims) : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ctx->input_type(0)) {} - xla::XlaOp InitValue(xla::XlaBuilder* b) override { - return xla::MinValue(b, xla_reduction_type_); - } + void Compile(XlaOpKernelContext* ctx) override { + auto ksize_or_error = GetKernelSize(ctx); + OP_REQUIRES_OK(ctx, ksize_or_error.status()); + std::vector ksize = ksize_or_error.ValueOrDie(); - const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) override { - return ctx->GetOrCreateMax(reduction_type_); - } + auto stride_or_error = GetStride(ctx); + OP_REQUIRES_OK(ctx, stride_or_error.status()); + std::vector stride = stride_or_error.ValueOrDie(); + + const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, input_shape.dims() == num_dims(), + errors::InvalidArgument("Input to ", type_string(), + " operator must have ", num_dims(), + " dimensions")); - xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, - const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape) override { - return output; + auto pooling = + xla::MaxPool(ctx->Input(0), ksize, stride, padding_, + XlaTensorFormat(data_format_, input_shape.dims() - 2)); + ctx->SetOutput(0, pooling); } }; @@ -180,9 +199,8 @@ class MaxPool3DOp : public MaxPoolOp { }; REGISTER_XLA_OP(Name("MaxPool3D"), MaxPool3DOp); -// Common computation shared between AvgPool and AvgPoolGrad. Divide each -// element of an image by the count of elements that contributed to that -// element during pooling. +// Divide each element of an image by the count of elements that contributed to +// that element during pooling. static xla::XlaOp AvgPoolDivideByCount( XlaOpKernelContext* ctx, const xla::XlaOp& output, DataType dtype, const TensorShape& input_shape, xla::Padding padding, @@ -241,20 +259,34 @@ class AvgPoolOp : public PoolingOp { /*reduction_type=*/ XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} - xla::XlaOp InitValue(xla::XlaBuilder* b) override { - return xla::Zero(b, xla_reduction_type_); - } + void Compile(XlaOpKernelContext* ctx) override { + auto ksize_or_error = GetKernelSize(ctx); + OP_REQUIRES_OK(ctx, ksize_or_error.status()); + std::vector ksize = ksize_or_error.ValueOrDie(); - const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) override { - return ctx->GetOrCreateAdd(reduction_type_); - } + auto stride_or_error = GetStride(ctx); + OP_REQUIRES_OK(ctx, stride_or_error.status()); + std::vector stride = stride_or_error.ValueOrDie(); + + const TensorShape input_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, input_shape.dims() == num_dims(), + errors::InvalidArgument("Input to ", type_string(), + " operator must have ", num_dims(), + " dimensions")); - xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, - const xla::XlaOp& output, DataType dtype, - const TensorShape& input_shape) override { - return AvgPoolDivideByCount(ctx, output, dtype, input_shape, padding_, - ksize_, stride_, num_spatial_dims_, - data_format_); + auto xla_data_format = + XlaTensorFormat(data_format_, input_shape.dims() - 2); + auto spatial_padding = MakeSpatialPadding( + input_shape.dim_sizes(), ksize, stride, padding_, xla_data_format); + + // Convert the input to the reduction type. + auto converted_input = + ConvertElementType(ctx->Input(0), xla_reduction_type_); + auto pooling = + xla::AvgPool(converted_input, ksize, stride, spatial_padding, + xla_data_format, padding_ == xla::Padding::kValid); + // Convert the pooling result back to the input type before returning it. + ctx->SetOutput(0, ConvertElementType(pooling, ctx->input_xla_type(0))); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index 1911e6ea362f999c787cbf95dcc9137a6a630273..64900e4709fd3e16d21096b0cfff8922906cb0d4 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -104,7 +104,7 @@ class RetvalOp : public XlaOpKernel { TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp); }; -REGISTER_XLA_OP(Name("_Retval"), RetvalOp); +REGISTER_XLA_OP(Name("_Retval").CompilationOnly(), RetvalOp); } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index 1d7a63dc311c60927f460e281601963e21232ec7..025ba827410f1a9f993a8a1855558a2daa86609b 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -38,11 +38,15 @@ class SoftmaxOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { const TensorShape logits_shape = ctx->InputShape(0); - OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape), - errors::InvalidArgument("logits must be 2-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(logits_shape), + errors::InvalidArgument("logits must have >= 1 dimension, got ", + logits_shape.DebugString())); - const int kBatchDim = 0; - const int kClassDim = 1; + // Major dimensions are batch dimensions, minor dimension is the class + // dimension. + std::vector batch_dims(logits_shape.dims() - 1); + std::iota(batch_dims.begin(), batch_dims.end(), 0); + const int kClassDim = logits_shape.dims() - 1; const DataType type = input_type(0); const xla::PrimitiveType xla_type = ctx->input_xla_type(0); @@ -56,7 +60,7 @@ class SoftmaxOp : public XlaOpKernel { xla::Reduce(logits, xla::MinValue(b, xla_type), max_func, {kClassDim}); // Subtract the max in batch b from every element in batch b. Broadcasts // along the batch dimension. - auto shifted_logits = xla::Sub(logits, logits_max, {kBatchDim}); + auto shifted_logits = xla::Sub(logits, logits_max, batch_dims); auto exp_shifted = xla::Exp(shifted_logits); const DataType accumulation_type = XlaHelpers::SumAccumulationType(type); xla::PrimitiveType xla_accumulation_type; @@ -71,9 +75,9 @@ class SoftmaxOp : public XlaOpKernel { auto softmax = log_ // softmax = shifted_logits - log(sum(exp(shifted_logits))) - ? xla::Sub(shifted_logits, xla::Log(sum), {kBatchDim}) + ? xla::Sub(shifted_logits, xla::Log(sum), batch_dims) // softmax = exp(shifted_logits) / sum(exp(shifted_logits)) - : xla::Div(exp_shifted, sum, {kBatchDim}); + : xla::Div(exp_shifted, sum, batch_dims); ctx->SetOutput(0, softmax); } diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 1e8a376765d36ffa677ece06fbd131744299e04b..296518229ebf0ba46717afc4f26d5ae1551c2862 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -301,6 +301,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { } REGISTER_XLA_OP(Name("While").AllowResourceTypes(), XlaWhileOp); +REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes(), XlaWhileOp); REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes(), XlaWhileOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 2fb66913ada375d53512b9a1115326b3cc2afea4..77da1bf29ced60e490f07abad41cf8ce96232982 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -32,6 +32,23 @@ Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, return Status::OK(); } +Status HostTensorToMutableBorrowingLiteral( + Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) { + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor->dtype(), + host_tensor->shape(), &xla_shape)); + return HostTensorToMutableBorrowingLiteral(xla_shape, host_tensor, literal); +} + +Status HostTensorToMutableBorrowingLiteral( + const xla::Shape& xla_shape, Tensor* host_tensor, + xla::MutableBorrowingLiteral* literal) { + *literal = xla::MutableBorrowingLiteral( + static_cast(DMAHelper::base(host_tensor)), xla_shape); + + return Status::OK(); +} + Status HostTensorsToBorrowingLiteralTuple( tensorflow::gtl::ArraySlice host_tensors, xla::BorrowingLiteral* literal) { diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index 0610a57029e72dff79a84742346f78a42b7f4ff1..09d6fa811669b422532673540e4da47f47e6be4e 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -30,6 +30,16 @@ namespace tensorflow { // 'host_tensor'. Status HostTensorToBorrowingLiteral(const Tensor& host_tensor, xla::BorrowingLiteral* literal); +// Returns a MutableBorrowingLiteral that utilizes the same underlying buffer +// owned by 'host_tensor', but is mutable via the xla::Literal methods. +Status HostTensorToMutableBorrowingLiteral( + Tensor* host_tensor, xla::MutableBorrowingLiteral* literal); +// Similar as above, except the literal shape is explicitly provided and used +// instead of obtaining it from the 'host_tensor'. The provided literal shape +// 'xla_shape' must be compatible with the shape of 'host_tensor'. +Status HostTensorToMutableBorrowingLiteral( + const xla::Shape& xla_shape, Tensor* host_tensor, + xla::MutableBorrowingLiteral* literal); // Returns a BorrowingLiteral tuple that utilizes the same underlying buffers // owned by 'host_tensors'. diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 9203e8d9e607e99ad738350a1c3f2b9e900df179..0e07485d1861aa40b14e527b14947c6f8bab647e 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include +#include #include #include @@ -297,4 +298,29 @@ void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype, } } +namespace { +uint32 InitialRandomSeed() { + // Support plumbing the TF seed through to XLA is being worked on. + // If a user wants deterministic behavior, their best option + // is to start with a known checkpoint. This also handles issues when + // multiple random calls can be invoked in any order by TF executor. + // Another option is to use stateless random ops. They have much cleaner + // semantics. + // If a user really wants to set a deterministic seed for XLA-based + // devices, this is the place to do it. + std::random_device rd; + // Make the starting value odd. + return rd() | 1; +} +} // namespace + +uint32 GetXLARandomSeed() { + // We initialize counter with an odd number and increment it by two + // everytime. This ensures that it will never be zero, even + // after an overflow. When seeded with zero, some XLA backends + // can return all zeros instead of random numbers. + static std::atomic counter(InitialRandomSeed()); + return counter.fetch_add(2); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h index 745beb39c1d917cd0d1cd219536ee26a96253ec9..33620ef810bd4fe897f384474e661e341a448b93 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.h +++ b/tensorflow/compiler/tf2xla/tf2xla_util.h @@ -56,6 +56,9 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges); void AddDtypeToKernalDefConstraint(StringPiece name, DataType dtype, KernelDef* kdef); +// Returns the next random seed to use for seeding xla rng. +uint32 GetXLARandomSeed(); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_ diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 672e19bd93449ccc31f4af5ded23257b197a3c39..1f0f240135dfcd0c540cc39a42514c67ce979ee0 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -16,45 +16,47 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include -#include "tensorflow/compiler/aot/runtime.h" namespace tensorflow { XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, AllocMode alloc_mode) - : raw_function_(static_data.raw_function), - result_index_(static_data.result_index), - args_(new void*[static_data.num_args]), - temps_(new void*[static_data.num_temps]), - arg_names_(static_data.arg_names), - result_names_(static_data.result_names), - program_shape_(static_data.program_shape), - hlo_profile_printer_data_(static_data.hlo_profile_printer_data) { + : raw_function_(static_data.raw_function_), + result_index_(static_data.result_index_), + buffer_table_(new void*[static_data.num_buffers_]), + buffer_infos_(static_data.buffer_infos_), + arg_index_table_(static_data.arg_index_table_), + num_args_(static_data.num_args_), + arg_names_(static_data.arg_names_), + result_names_(static_data.result_names_), + program_shape_(static_data.program_shape_), + hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) { + bool allocate_entry_params = + alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS; // Allocate arg and temp buffers. - if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) { - alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers( - static_data.arg_sizes, static_data.num_args, args_, - /*annotate_initialized=*/false); - } - alloc_temps_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers( - static_data.temp_sizes, static_data.num_temps, temps_, + alloc_buffer_table_ = cpu_function_runtime::MallocContiguousBuffers( + static_data.buffer_infos_, static_data.num_buffers_, + /*allocate_entry_params=*/allocate_entry_params, buffer_table_, /*annotate_initialized=*/true); - // If Hlo profiling is enabled the generated code expects an appropriately // sized buffer to be passed in as the last argument. If Hlo profiling is // disabled the last function argument is still present in the function // signature, but it is ignored by the generated code and we pass in null for // it. if (hlo_profiling_enabled()) { - profile_counters_ = new int64[static_data.profile_counters_size](); + profile_counters_ = new int64[static_data.profile_counters_size_](); } } +bool XlaCompiledCpuFunction::Run() { + raw_function_(buffer_table_[result_index_], &run_options_, nullptr, + buffer_table_, profile_counters_); + return true; +} + XlaCompiledCpuFunction::~XlaCompiledCpuFunction() { - tensorflow::tfcompile::runtime::FreeContiguous(alloc_args_); - tensorflow::tfcompile::runtime::FreeContiguous(alloc_temps_); - delete[] args_; - delete[] temps_; + cpu_function_runtime::FreeContiguous(alloc_buffer_table_); + delete[] buffer_table_; delete[] profile_counters_; } diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 48a8c083cacf2f6ecf9dc1817b6174c01385d035..425e769346ffcbc548495d93cb7adc779f860110 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/core/platform/types.h" @@ -56,36 +57,85 @@ class XlaCompiledCpuFunction { // StaticData represents the state necessary to run an XLA-compiled // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for // AOT this is backed by data compiled into the object file. - struct StaticData { + // + // The contents of StaticData are XLA-internal implementation details and + // should not be relied on by clients. + // + // TODO(sanjoy): Come up with a cleaner way to express the contraint we want + // here: generated XlaCompiledCpuFunction subclasses should be able to create + // instances of StaticData but only XlaCompiledCpuFunction should be able to + // read from StaticData instances. + class StaticData { + public: + void set_raw_function(RawFunction raw_function) { + raw_function_ = raw_function; + } + void set_buffer_infos( + const cpu_function_runtime::BufferInfo* buffer_infos) { + buffer_infos_ = buffer_infos; + } + void set_num_buffers(size_t num_buffers) { num_buffers_ = num_buffers; } + void set_arg_index_table(const int32* arg_index_table) { + arg_index_table_ = arg_index_table; + } + void set_num_args(int64 num_args) { num_args_ = num_args; } + void set_result_index(size_t result_index) { result_index_ = result_index; } + void set_arg_names(const char** arg_names) { arg_names_ = arg_names; } + void set_result_names(const char** result_names) { + result_names_ = result_names; + } + void set_program_shape(const xla::ProgramShape* program_shape) { + program_shape_ = program_shape; + } + const xla::HloProfilePrinterData* hlo_profile_printer_data() const { + return hlo_profile_printer_data_; + } + void set_hlo_profile_printer_data( + const xla::HloProfilePrinterData* hlo_profile_printer_data) { + hlo_profile_printer_data_ = hlo_profile_printer_data; + } + void set_profile_counters_size(int64 profile_counters_size) { + profile_counters_size_ = profile_counters_size; + } + + private: // The raw function to call. - RawFunction raw_function; + RawFunction raw_function_; + + // Contains information about the buffers used by the XLA computation. + const cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; + size_t num_buffers_ = 0; + + // Entry parameter i is described by + // buffer_infos[arg_index_table[i]]. + const int32* arg_index_table_ = nullptr; - // Cardinality and sizes of arg and temp buffers. - const intptr_t* arg_sizes = nullptr; - size_t num_args = 0; - const intptr_t* temp_sizes = nullptr; - size_t num_temps = 0; + // There are num_args entry parameters. + int64 num_args_ = 0; // The 0-based index of the result tuple, in the temp buffers. - size_t result_index = 0; + size_t result_index_ = 0; // [Optional] Arrays of arg and result names. These are arrays of C-style // strings, where the array is terminated by nullptr. - const char** arg_names = nullptr; - const char** result_names = nullptr; + const char** arg_names_ = nullptr; + const char** result_names_ = nullptr; // [Optional] Arg and result shapes. - const xla::ProgramShape* program_shape = nullptr; + const xla::ProgramShape* program_shape_ = nullptr; // [Optional] Profile printer data. Null if profiling is disabled. - const xla::HloProfilePrinterData* hlo_profile_printer_data = nullptr; + const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; // [Optional] The number of profile counters expected in the profile counter // buffer by the generated code and hlo_profile_printer. 0 if profiling is // disabled. This information is already present in // hlo_profile_printer_data but xla::HloProfilePrinterData is forward // declared so we don't have access to that information here. - int64 profile_counters_size = 0; + int64 profile_counters_size_ = 0; + + // Only XlaCompiledCpuFunction is allowed to read the above fields. + friend class XlaCompiledCpuFunction; }; // AllocMode controls the buffer allocation mode. @@ -113,11 +163,7 @@ class XlaCompiledCpuFunction { // Runs the computation, with inputs read from arg buffers, and outputs // written to result buffers. Returns true on success and false on failure. - bool Run() { - raw_function_(temps_[result_index_], &run_options_, - const_cast(args_), temps_, profile_counters_); - return true; - } + bool Run(); // Returns the error message from the previous failed Run call. // @@ -129,14 +175,25 @@ class XlaCompiledCpuFunction { // ------------------------------ // Arg methods for managing input buffers. Buffers are in row-major order. - // Returns the underlying array of argument buffers, where args()[I] is the - // buffer for the positional argument at index I. - void** args() { return args_; } - const void* const* args() const { return args_; } - // Returns the buffer for the positional argument at the given `index`. - void* arg_data(size_t index) { return args_[index]; } - const void* arg_data(size_t index) const { return args_[index]; } + void* arg_data(size_t index) { + return buffer_table_[arg_index_table_[index]]; + } + const void* arg_data(size_t index) const { + return buffer_table_[arg_index_table_[index]]; + } + + int num_args() const { return num_args_; } + + // Returns the size of entry parameter `idx`. + // + // There is a static version of this method on tfcompile generated subclasses + // of XlaCompiledCpuFunction, but try to prefer this when possible since it + // works both for XlaJitCompiledCpuFunction and AOT compiled subclasses. + int arg_size(int idx) const { + assert(idx < num_args()); + return buffer_infos_[arg_index_table_[idx]].size(); + } // Sets the buffer for the positional argument at the given `index` to `data`. // Must be called before Run to have an effect. May be called under any @@ -149,7 +206,9 @@ class XlaCompiledCpuFunction { // // Aliasing of argument and result buffers is not allowed, and results in // undefined behavior. - void set_arg_data(size_t index, void* data) { args_[index] = data; } + void set_arg_data(size_t index, void* data) { + buffer_table_[arg_index_table_[index]] = data; + } // ------------------------------ // Result methods for managing output buffers. Buffers are in row-major order. @@ -159,9 +218,9 @@ class XlaCompiledCpuFunction { // Returns the underlying array of result buffers, where results()[I] is the // buffer for the positional result at index I. - void** results() { return static_cast(temps_[result_index_]); } + void** results() { return static_cast(buffer_table_[result_index_]); } const void* const* results() const { - return static_cast(temps_[result_index_]); + return static_cast(buffer_table_[result_index_]); } // Profile counters for this XLA computation. @@ -219,14 +278,28 @@ class XlaCompiledCpuFunction { const RawFunction raw_function_; const size_t result_index_; - // Arrays of argument and temp buffers; entries in args_ may be overwritten by - // the user. - void** args_ = nullptr; - void** temps_ = nullptr; + // Array containing pointers to argument and temp buffers (slots corresponding + // to constant and on-stack buffers are null). + void** const buffer_table_; - // Backing memory for individual arg and temp buffers. - void* alloc_args_ = nullptr; - void* alloc_temps_ = nullptr; + // Describes the buffers used by the XLA computation. + const cpu_function_runtime::BufferInfo* const buffer_infos_; + + // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]] + // for XLA generated code to be able to find it. + // + // For now we need to keep around the args_ array because there is code that + // depends on args() returning a void**. However, in the future we may remove + // args_ in favor of using buffer_table_ as the sole storage for the + // arguments. + const int32* const arg_index_table_; + + // The number of incoming arguments. + const int32 num_args_; + + // Backing memory for buffer_table_ and args_, the latter depending on + // AllocMode. + void* alloc_buffer_table_ = nullptr; // Backing memory for profiling counters. int64* profile_counters_ = nullptr; diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 00ccfb1c7873c85564b1bf4cf582cd31baa17ad5..86a78ee429e8913edb4a948727fa692083c472f4 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -35,41 +36,6 @@ limitations under the License. namespace tensorflow { namespace { - -// Returns a vector of positional argument buffer sizes. -xla::StatusOr> ComputeArgSizes( - const xla::ProgramShape& program_shape) { - std::vector arg_sizes; - const size_t num_args = program_shape.parameters_size(); - arg_sizes.reserve(num_args); - for (int i = 0; i < num_args; ++i) { - const xla::Shape& arg_shape = program_shape.parameters(i); - constexpr size_t kPointerSize = sizeof(void*); - arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize)); - } - return std::move(arg_sizes); -} - -// Returns a vector of positional temporary buffer sizes. -xla::StatusOr> ComputeTempSizes( - const xla::BufferAssignment& buffer_assignment) { - const std::vector& allocations = - buffer_assignment.Allocations(); - std::vector temp_sizes; - temp_sizes.reserve(allocations.size()); - for (const xla::BufferAllocation& allocation : allocations) { - // Callers don't allocate temporary buffers for parameters. Nor for - // thread-local buffers, which are lowered to alloca. - if (allocation.is_entry_computation_parameter() || - allocation.is_thread_local()) { - temp_sizes.push_back(-1); - } else { - temp_sizes.push_back(allocation.size()); - } - } - return std::move(temp_sizes); -} - // Returns the index of the result in the temp buffers. xla::StatusOr ComputeResultIndex( const xla::BufferAssignment& buffer_assignment) { @@ -153,11 +119,11 @@ XlaJitCompiledCpuFunction::Compile( const xla::BufferAssignment& buffer_assignment = cpu_executable->buffer_assignment(); - // Compute buffer sizes and the result index, needed to run the raw function. - TF_ASSIGN_OR_RETURN(std::vector arg_sizes, - ComputeArgSizes(*program_shape)); - TF_ASSIGN_OR_RETURN(std::vector temp_sizes, - ComputeTempSizes(buffer_assignment)); + // Compute buffer infos and the result index, needed to run the raw function. + std::vector buffer_infos = + xla::cpu::CreateBufferInfosFromBufferAssignment(buffer_assignment); + std::vector arg_index_table = + xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); TF_ASSIGN_OR_RETURN(size_t result_index, ComputeResultIndex(buffer_assignment)); @@ -165,28 +131,28 @@ XlaJitCompiledCpuFunction::Compile( new XlaJitCompiledCpuFunction); XlaJitCompiledCpuFunction* jit = jit_unique_ptr.get(); jit->executable_ = std::move(executable); - jit->arg_sizes_ = std::move(arg_sizes); - jit->temp_sizes_ = std::move(temp_sizes); + jit->buffer_infos_ = std::move(buffer_infos); + jit->arg_index_table_ = std::move(arg_index_table); jit->program_shape_ = std::move(program_shape); - jit->static_data_.raw_function = std::move(raw_function); - jit->static_data_.arg_sizes = jit->arg_sizes_.data(); - jit->static_data_.num_args = jit->arg_sizes_.size(); - jit->static_data_.temp_sizes = jit->temp_sizes_.data(); - jit->static_data_.num_temps = jit->temp_sizes_.size(); - jit->static_data_.result_index = result_index; + jit->static_data_.set_raw_function(raw_function); + jit->static_data_.set_buffer_infos(jit->buffer_infos_.data()); + jit->static_data_.set_num_buffers(jit->buffer_infos_.size()); + jit->static_data_.set_arg_index_table(jit->arg_index_table_.data()); + jit->static_data_.set_num_args(jit->arg_index_table_.size()); + jit->static_data_.set_result_index(result_index); // Optional metadata is collected and set below. CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_); CollectNames(config.fetch(), &jit->nonempty_result_names_, &jit->result_names_); - jit->static_data_.arg_names = jit->arg_names_.data(); - jit->static_data_.result_names = jit->result_names_.data(); - jit->static_data_.program_shape = jit->program_shape_.get(); + jit->static_data_.set_arg_names(jit->arg_names_.data()); + jit->static_data_.set_result_names(jit->result_names_.data()); + jit->static_data_.set_program_shape(jit->program_shape_.get()); if (cpu_executable->hlo_profiling_enabled()) { - jit->static_data_.hlo_profile_printer_data = - &cpu_executable->hlo_profile_printer_data(); - jit->static_data_.profile_counters_size = - cpu_executable->hlo_profile_printer_data().profile_counters_size(); + jit->static_data_.set_hlo_profile_printer_data( + &cpu_executable->hlo_profile_printer_data()); + jit->static_data_.set_profile_counters_size( + cpu_executable->hlo_profile_printer_data().profile_counters_size()); } return std::move(jit_unique_ptr); diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index af307ae4eff74927242c4650d8a43710e991cc52..d3c8f22a8078d03d15447ed200c914390f40b04f 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -66,9 +66,11 @@ class XlaJitCompiledCpuFunction { // The static data is backed by the rest of the state in this class. XlaCompiledCpuFunction::StaticData static_data_; - // The backing arrays of arg and temp buffer sizes. - std::vector arg_sizes_; - std::vector temp_sizes_; + // The backing array for buffer infos. + std::vector buffer_infos_; + + // The backing array for the arg index table. + std::vector arg_index_table_; // The backing arrays of arg and result names. We hold the actual strings in // nonempty_*_names_, and hold arrays of pointers in *_names_ for the static diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h index ea75ad32d5df7bbadd37e89de6144b264ab6d5d1..2d5d078aa77423cc18bab053b80a7576acbd849e 100644 --- a/tensorflow/compiler/xla/array.h +++ b/tensorflow/compiler/xla/array.h @@ -409,7 +409,7 @@ class Array { // Returns the total number of elements in the array. int64 num_elements() const { - return std::accumulate(sizes_.begin(), sizes_.end(), 1, + return std::accumulate(sizes_.begin(), sizes_.end(), 1LL, std::multiplies()); } diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index 39d5582d19dbb9942ae87e1962fc9fa713bcdd50..a2f32ab97eab10294a607f35fc79ded1cc2c5792 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -121,6 +121,30 @@ xla_test( ], ) +cc_library( + name = "pooling", + srcs = ["pooling.cc"], + hdrs = ["pooling.h"], + deps = [ + ":arithmetic", + ":constants", + "//tensorflow/compiler/tf2xla/lib:util", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/core:lib", + ], +) + +xla_test( + name = "pooling_test", + srcs = ["pooling_test.cc"], + deps = [ + ":pooling", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "prng", srcs = ["prng.cc"], @@ -144,7 +168,7 @@ cc_library( ":numeric", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", ], ) @@ -161,7 +185,7 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], diff --git a/tensorflow/compiler/xla/client/lib/pooling.cc b/tensorflow/compiler/xla/client/lib/pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..7199269a6c889f3589c1148687faf0bb2aaae90a --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/pooling.cc @@ -0,0 +1,183 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/pooling.h" +#include "tensorflow/compiler/tf2xla/lib/util.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" + +namespace xla { + +namespace { + +// Common computation shared between AvgPool and AvgPoolGrad. Divide each +// element of an image by the count of elements that contributed to that +// element during pooling. +XlaOp AvgPoolDivideByCountWithGeneralPadding( + XlaOp sums, PrimitiveType dtype, + tensorflow::gtl::ArraySlice input_shape, + tensorflow::gtl::ArraySlice> spatial_padding, + tensorflow::gtl::ArraySlice ksize, + tensorflow::gtl::ArraySlice stride, + const TensorFormat& data_format) { + // The padding shouldn't be included in the counts. We use another + // ReduceWindow to find the right counts. + const int num_spatial_dims = spatial_padding.size(); + + std::vector input_dim_sizes(num_spatial_dims); + std::vector window_dims(num_spatial_dims); + std::vector window_ksize(num_spatial_dims); + std::vector window_stride(num_spatial_dims); + CHECK_EQ(data_format.num_spatial_dims(), num_spatial_dims) + << "Invalid number of spatial dimentions in data format specification"; + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + input_dim_sizes[i] = input_shape[dim]; + window_dims[i] = dim; + window_ksize[i] = ksize[dim]; + window_stride[i] = stride[dim]; + } + + XlaBuilder* b = sums.builder(); + // Build a matrix of all 1s, with the same width/height as the input. + auto ones = Broadcast(One(b, dtype), input_dim_sizes); + PaddingConfig padding_config; + for (int i = 0; i < num_spatial_dims; ++i) { + auto dims = padding_config.add_dimensions(); + dims->set_edge_padding_low(spatial_padding[i].first); + dims->set_edge_padding_high(spatial_padding[i].second); + } + auto zero = Zero(b, dtype); + auto padded_ones = Pad(ones, zero, padding_config); + + // Perform a ReduceWindow with the same window size, strides, and padding + // to count the number of contributions to each result element. + auto counts = + ReduceWindow(padded_ones, zero, CreateScalarAddComputation(dtype, b), + window_ksize, window_stride, Padding::kValid); + + return Div(sums, counts, window_dims); +} + +// Sums all elements in the window specified by 'kernel_size' and 'stride'. +XlaOp ComputeSums(XlaOp operand, XlaOp init_value, + tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, + const TensorFormat& data_format) { + XlaBuilder* b = operand.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape operand_shape, b->GetShape(operand)); + TF_ASSIGN_OR_RETURN(Shape init_shape, b->GetShape(init_value)); + PrimitiveType accumulation_type = init_shape.element_type(); + auto add_computation = CreateScalarAddComputation(accumulation_type, b); + return ReduceWindow(operand, init_value, add_computation, kernel_size, + stride, Padding::kValid); + }); +} + +// Creates a padding configuration out of spatial padding values. +PaddingConfig MakeSpatialPaddingConfig( + tensorflow::gtl::ArraySlice> spatial_padding, + tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, + const TensorFormat& data_format) { + const int num_spatial_dims = kernel_size.size() - 2; + PaddingConfig padding_config; + for (int i = 0; i < 2 + num_spatial_dims; ++i) { + padding_config.add_dimensions(); + } + CHECK_EQ(data_format.num_spatial_dims(), num_spatial_dims) + << "Invalid number of spatial dimentions in data format specification"; + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + auto padding_dimension = padding_config.mutable_dimensions(dim); + padding_dimension->set_edge_padding_low(spatial_padding[i].first); + padding_dimension->set_edge_padding_high(spatial_padding[i].second); + } + return padding_config; +} + +} // namespace + +XlaOp MaxPool(XlaOp operand, tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, Padding padding, + const TensorFormat& data_format) { + XlaBuilder* b = operand.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape operand_shape, b->GetShape(operand)); + PrimitiveType dtype = operand_shape.element_type(); + auto max_computation = CreateScalarMaxComputation(dtype, b); + auto init_value = MinValue(b, dtype); + return ReduceWindow(operand, init_value, max_computation, kernel_size, + stride, padding); + }); +} + +XlaOp AvgPool(XlaOp operand, tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, + tensorflow::gtl::ArraySlice> padding, + const TensorFormat& data_format, + const bool counts_include_padding) { + XlaBuilder* b = operand.builder(); + return b->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape operand_shape, b->GetShape(operand)); + PrimitiveType dtype = operand_shape.element_type(); + auto init_value = Zero(b, dtype); + std::vector input_size(operand_shape.dimensions().begin(), + operand_shape.dimensions().end()); + auto padding_config = + MakeSpatialPaddingConfig(padding, kernel_size, stride, data_format); + auto padded_operand = Pad(operand, Zero(b, dtype), padding_config); + auto pooled = ComputeSums(padded_operand, init_value, kernel_size, stride, + data_format); + if (counts_include_padding) { + // If counts include padding, all windows have the same number of elements + // contributing to each average. Divide by the window size everywhere to + // get the average. + int64 window_size = + std::accumulate(kernel_size.begin(), kernel_size.end(), 1, + [](int64 x, int64 y) { return x * y; }); + + auto divisor = ConstantR0WithType(b, dtype, window_size); + return pooled / divisor; + } else { + return AvgPoolDivideByCountWithGeneralPadding( + pooled, dtype, input_size, padding, kernel_size, stride, data_format); + } + }); +} + +std::vector> MakeSpatialPadding( + tensorflow::gtl::ArraySlice input_size, + tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, Padding padding, + const TensorFormat& data_format) { + const int num_spatial_dims = kernel_size.size() - 2; + std::vector input_spatial_dimensions; + std::vector kernel_size_spatial_dimensions; + std::vector stride_spatial_dimensions; + CHECK_EQ(data_format.num_spatial_dims(), num_spatial_dims) + << "Invalid number of spatial dimentions in data format specification"; + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + input_spatial_dimensions.push_back(input_size[dim]); + kernel_size_spatial_dimensions.push_back(kernel_size[dim]); + stride_spatial_dimensions.push_back(stride[dim]); + } + return MakePadding(input_spatial_dimensions, kernel_size_spatial_dimensions, + stride_spatial_dimensions, padding); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/pooling.h b/tensorflow/compiler/xla/client/lib/pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..1699c585d3b09a306c21cfa797a9023a8463bd1f --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/pooling.h @@ -0,0 +1,73 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace xla { + +// Tensor format for reduce window operations. +class TensorFormat { + public: + TensorFormat(int batch_dimension, int feature_dimension, + tensorflow::gtl::ArraySlice spatial_dimensions) + : batch_dimension_(batch_dimension), + feature_dimension_(feature_dimension), + spatial_dimensions_(spatial_dimensions.begin(), + spatial_dimensions.end()) {} + + int batch_dimension() const { return batch_dimension_; } + + int feature_dimension() const { return feature_dimension_; } + + int spatial_dimension(int dim) const { return spatial_dimensions_[dim]; } + + int num_spatial_dims() const { return spatial_dimensions_.size(); } + + private: + // The number of the dimension that represents the batch. + int batch_dimension_; + // The number of the dimension that represents the features. + int feature_dimension_; + // The dimension numbers for the spatial dimensions. + tensorflow::gtl::InlinedVector spatial_dimensions_; +}; + +// Computes the max pool of 'operand'. +XlaOp MaxPool(XlaOp operand, tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, Padding padding, + const TensorFormat& data_format); + +// Computes the average pool of 'operand'. +XlaOp AvgPool(XlaOp operand, tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, + tensorflow::gtl::ArraySlice> padding, + const TensorFormat& data_format, + const bool counts_include_padding); + +// Returns the list of low and high padding elements in each spatial dimension +// for the given 'padding' specification. +std::vector> MakeSpatialPadding( + tensorflow::gtl::ArraySlice input_size, + tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, Padding padding, + const TensorFormat& data_format); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ diff --git a/tensorflow/compiler/xla/client/lib/pooling_test.cc b/tensorflow/compiler/xla/client/lib/pooling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b4553b60db555ad7c2ab6b695236df745e30683 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/pooling_test.cc @@ -0,0 +1,185 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/pooling.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +TensorFormat MakeNCHWFormat(int num_spatial_dims) { + tensorflow::gtl::InlinedVector spatial_dimensions; + for (int i = 0; i < num_spatial_dims; ++i) { + spatial_dimensions.push_back(i + 2); + } + return TensorFormat(/*batch_dimension=*/0, /*feature_dimension=*/1, + /*spatial_dimensions=*/spatial_dimensions); +} + +std::vector> MakeGeneralPadding( + XlaOp input, tensorflow::gtl::ArraySlice kernel_size, + tensorflow::gtl::ArraySlice stride, Padding padding, + const xla::TensorFormat& data_format) { + XlaBuilder* b = input.builder(); + Shape operand_shape = b->GetShape(input).ValueOrDie(); + std::vector input_size(operand_shape.dimensions().begin(), + operand_shape.dimensions().end()); + return MakeSpatialPadding(input_size, kernel_size, stride, padding, + data_format); +} + +// Add singleton batch and feature dimensions to spatial dimensions, according +// to 'data_format' specification. +std::vector ExpandWithBatchAndFeatureDimensions( + tensorflow::gtl::ArraySlice spatial_dim_sizes, + const xla::TensorFormat& data_format) { + const int num_spatial_dims = spatial_dim_sizes.size(); + std::vector tensor_sizes(num_spatial_dims + 2, 1); + for (int i = 0; i < num_spatial_dims; ++i) { + int dim = data_format.spatial_dimension(i); + tensor_sizes[dim] = spatial_dim_sizes[i]; + } + return tensor_sizes; +} + +class PoolingTest : public ClientLibraryTestBase { + public: + ErrorSpec error_spec_{0.0001}; +}; + +XLA_TEST_F(PoolingTest, MaxPool2D) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + MaxPool(input, kernel_size, stride, Padding::kValid, data_format); + + ComputeAndCompareR4(&builder, {{{{5, 4}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, MaxPool2DWithPadding) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + MaxPool(input, kernel_size, stride, Padding::kSame, data_format); + + ComputeAndCompareR4(&builder, {{{{5, 4, 5}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, MaxPool2DWithPaddingAndStride) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + MaxPool(input, kernel_size, stride, Padding::kSame, data_format); + + ComputeAndCompareR4(&builder, {{{{5, 4, 4, 5, 5}, {5, 4, 3, 2, 1}}}}, + {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2D) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + auto padding = MakeGeneralPadding(input, kernel_size, stride, Padding::kValid, + data_format); + AvgPool(input, kernel_size, stride, padding, data_format, + /*counts_include_padding=*/true); + + ComputeAndCompareR4(&builder, {{{{3, 3}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DWithPadding) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = kernel_size; + auto padding = MakeGeneralPadding(input, kernel_size, stride, Padding::kSame, + data_format); + AvgPool(input, kernel_size, stride, padding, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, {{{{3, 3, 3}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DWithPaddingAndStride) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({1, 1}, data_format); + auto padding = MakeGeneralPadding(input, kernel_size, stride, Padding::kSame, + data_format); + AvgPool(input, kernel_size, stride, padding, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, + {{{{3, 3, 3, 3, 3}, {4.5, 3.5, 2.5, 1.5, 1}}}}, {}, + error_spec_); +} + +XLA_TEST_F(PoolingTest, AvgPool2DWithGeneralPaddingCountNotIncludePadding) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({3, 3}, data_format); + auto stride = kernel_size; + AvgPool(input, kernel_size, stride, {{1, 1}, {2, 1}}, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, {{{{3, 3}}}}, {}, error_spec_); +} + +XLA_TEST_F(PoolingTest, + AvgPool2DWithGeneralPaddingCountNotIncludePaddingAndStride) { + XlaBuilder builder(TestName()); + + XlaOp input = ConstantR4FromArray4D( + &builder, {{{{1, 2, 3, 4, 5}, {5, 4, 3, 2, 1}}}}); + auto data_format = MakeNCHWFormat(2); + auto kernel_size = ExpandWithBatchAndFeatureDimensions({3, 3}, data_format); + auto stride = ExpandWithBatchAndFeatureDimensions({2, 2}, data_format); + AvgPool(input, kernel_size, stride, {{2, 1}, {1, 1}}, data_format, + /*counts_include_padding=*/false); + + ComputeAndCompareR4(&builder, {{{{1.5, 3, 4.5}, {3, 3, 3}}}}, {}, + error_spec_); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index 3a744148fba9957c10c825c00d500960f134396c..6ef81689489d8117d5951bcb75693c2e3413e4d6 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -56,7 +56,7 @@ ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key) { // Performs a single round of the Threefry2x32 algorithm, with a rotation // amount 'rotation'. - auto round = [builder](ThreeFry2x32State v, int rotation) { + auto round = [](ThreeFry2x32State v, int rotation) { v[0] = v[0] + v[1]; v[1] = RotateLeftS32(v[1], rotation); v[1] = v[0] ^ v[1]; diff --git a/tensorflow/compiler/xla/client/lib/sorting.h b/tensorflow/compiler/xla/client/lib/sorting.h index 404b4783c3878ca0fab811fa8c3d02686af44316..b9dfafdd6f957ae050e0f5dbd076d5288235b490 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.h +++ b/tensorflow/compiler/xla/client/lib/sorting.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SORTING_H_ -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index b6eee762a5f002e00fd6118d91f25343e22f13d3..fef98c9923096e21a755c6d730de2c7c10852b2d 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/sorting.h" -#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc index b1a776b8b84eb0954e0d874d1b707e46c92f6389..081fec7ad92958aa285e4be41394d7b1876e0815 100644 --- a/tensorflow/compiler/xla/client/lib/testing.cc +++ b/tensorflow/compiler/xla/client/lib/testing.cc @@ -98,14 +98,13 @@ std::vector> MakeFakeArgumentsOrDie( << "Computation should have progran shape."; auto program_shape = computation.proto().program_shape(); - // For every (unbound) parameter that the computation wants, we manufacture - // some arbitrary data so that we can invoke the computation. - std::vector> fake_arguments; - for (const Shape& parameter : program_shape.parameters()) { - fake_arguments.push_back(MakeFakeDataOrDie(parameter, client)); - } - - return fake_arguments; + // Create and run a program which produces a tuple with one element per + // parameter, then return the tuple's constituent buffers. + std::vector param_shapes(program_shape.parameters().begin(), + program_shape.parameters().end()); + auto fake_input_tuple = + MakeFakeDataOrDie(ShapeUtil::MakeTupleShape(param_shapes), client); + return client->DeconstructTuple(*fake_input_tuple).ValueOrDie(); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index e7250e11d5e59bb01026d5cf304901d17fd2ba42..cffb24e29beda6a8c40dca2fe709be22892dd489 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -101,11 +101,14 @@ Status LocalExecutable::ValidateExecutionOptions( } } - // Verify that the device the executable was built for is equivalent to the - // device it will run on. - int run_device_ordinal = run_options.device_ordinal() == -1 - ? backend_->default_device_ordinal() - : run_options.device_ordinal(); + // Verify that the device the executable was built for is equivalent + // to the device it will run on. + int run_device_ordinal = run_options.device_ordinal(); + if (run_device_ordinal == -1) { + run_device_ordinal = run_options.stream() != nullptr + ? run_options.stream()->parent()->device_ordinal() + : backend_->default_device_ordinal(); + } TF_ASSIGN_OR_RETURN(bool devices_equivalent, backend_->devices_equivalent( run_device_ordinal, build_options_.device_ordinal())); @@ -300,7 +303,7 @@ StatusOr> LocalClient::TransferFromOutfeedLocal( const Shape& shape, int device_ordinal) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, backend().stream_executor(device_ordinal)); - auto literal = MakeUnique(); + auto literal = Literal::CreateFromShape(shape); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromOutfeed( executor, shape, literal.get())); return std::move(literal); diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 53be5a79c23438e103e353b8c5fc0e2446ad78c0..b3b00e2fffe1196b36190ec72d1425bae4e4e276 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -45,21 +45,6 @@ int64 GetUniqueId() { return id; } -// Returns true if an instruction with the given opcode can be the root of the -// computation. -bool CanBeRoot(HloOpcode opcode) { - switch (opcode) { - case HloOpcode::kAfterAll: - case HloOpcode::kSend: - case HloOpcode::kSendDone: - case HloOpcode::kOutfeed: - case HloOpcode::kTrace: - return false; - default: - return true; - } -} - } // namespace XlaOp operator-(const XlaOp& x) { return Neg(x); } @@ -142,28 +127,13 @@ XlaOp XlaBuilder::ReportErrorOrReturn( return ReportErrorOrReturn(op_creator()); } -StatusOr XlaBuilder::GetProgramShape(int64* root_id) const { +StatusOr XlaBuilder::GetProgramShape(int64 root_id) const { TF_RETURN_IF_ERROR(first_error_); - - TF_RET_CHECK(root_id != nullptr); + TF_RET_CHECK((root_id >= 0) && (root_id < instructions_.size())); ProgramShape program_shape; - // Not all instructions can be roots. Walk backwards from the last added - // instruction until a valid root is found. - int64 index = instructions_.size() - 1; - for (; index >= 0; index--) { - TF_ASSIGN_OR_RETURN(HloOpcode opcode, - StringToHloOpcode(instructions_[index].opcode())); - if (CanBeRoot(opcode)) { - break; - } - } - if (index < 0) { - return FailedPrecondition("no root instruction was found"); - } - *root_id = instructions_[index].id(); - *program_shape.mutable_result() = instructions_[index].shape(); + *program_shape.mutable_result() = instructions_[root_id].shape(); // Check that the parameter numbers are continuous from 0, and add parameter // shapes and names to the program shape. @@ -188,8 +158,15 @@ StatusOr XlaBuilder::GetProgramShape(int64* root_id) const { } StatusOr XlaBuilder::GetProgramShape() const { - int64 root; - return GetProgramShape(&root); + TF_RET_CHECK(!instructions_.empty()); + return GetProgramShape(instructions_.back().id()); +} + +StatusOr XlaBuilder::GetProgramShape(XlaOp root) const { + if (root.builder_ != this) { + return InvalidArgument("Given root operation is not in this computation."); + } + return GetProgramShape(root.handle()); } void XlaBuilder::IsConstantVisitor(const int64 op_handle, @@ -257,17 +234,29 @@ StatusOr XlaBuilder::Build() { first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace); return AppendStatus(first_error_, backtrace); } + return Build(instructions_.back().id()); +} + +StatusOr XlaBuilder::Build(XlaOp root) { + if (root.builder_ != this) { + return InvalidArgument("Given root operation is not in this computation."); + } + return Build(root.handle()); +} + +StatusOr XlaBuilder::Build(int64 root_id) { + if (!first_error_.ok()) { + string backtrace; + first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace); + return AppendStatus(first_error_, backtrace); + } HloComputationProto entry; entry.set_id(GetUniqueId()); // Give the computation a global unique id. entry.set_name(StrCat(name_, entry.id())); // Ensure that the name is unique. - { - int64 root_id; - TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), - GetProgramShape(&root_id)); - entry.set_root_id(root_id); - } + TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id)); + entry.set_root_id(root_id); for (auto& instruction : instructions_) { // Ensures that the instruction names are unique among the whole graph. @@ -1099,11 +1088,11 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) { sharding_builder::AssignDevice(0); XlaScopedShardingAssignment scoped_sharding(this, infeed_instruction_sharding); - TF_ASSIGN_OR_RETURN(infeed, - AddInstruction(std::move(instr), HloOpcode::kInfeed)); + TF_ASSIGN_OR_RETURN( + infeed, AddInstruction(std::move(instr), HloOpcode::kInfeed, {})); } else { - TF_ASSIGN_OR_RETURN(infeed, - AddInstruction(std::move(instr), HloOpcode::kInfeed)); + TF_ASSIGN_OR_RETURN( + infeed, AddInstruction(std::move(instr), HloOpcode::kInfeed, {})); } // The infeed instruction produces a tuple of the infed data and a token @@ -1635,6 +1624,32 @@ XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices, }); } +XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, + const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + + TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input)); + TF_ASSIGN_OR_RETURN(const Shape& scatter_indices_shape, + GetShape(scatter_indices)); + TF_ASSIGN_OR_RETURN(const Shape& updates_shape, GetShape(updates)); + TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape, + update_computation.GetProgramShape()); + TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), + ShapeInference::InferScatterShape( + input_shape, scatter_indices_shape, updates_shape, + to_apply_shape, dimension_numbers)); + + *instr.mutable_scatter_dimension_numbers() = dimension_numbers; + + AddCalledComputation(update_computation, &instr); + return AddInstruction(std::move(instr), HloOpcode::kScatter, + {input, scatter_indices, updates}); + }); +} + XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand, const XlaComputation& true_computation, const XlaOp& false_operand, @@ -1681,7 +1696,7 @@ XlaOp XlaBuilder::Reduce( TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), ShapeInference::InferReduceShape( - operand_shape, init_shape, dimensions_to_reduce, + {&operand_shape, &init_shape}, dimensions_to_reduce, called_program_shape)); for (int64 dim : dimensions_to_reduce) { @@ -1866,6 +1881,61 @@ XlaOp XlaBuilder::CrossReplicaSum( }); } +XlaOp XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups) { + return ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + + // The HloInstruction for Alltoall currently only handles the data + // communication: it accepts N already split parts and scatters them to N + // cores, and each core gathers the N received parts into a tuple as the + // output. So here we explicitly split the operand before the hlo alltoall, + // and concat the tuple elements. + // + // First, run shape inference to make sure the shapes are valid. + TF_RETURN_IF_ERROR( + ShapeInference::InferAllToAllShape(operand_shape, split_dimension, + concat_dimension, split_count) + .status()); + + // Split into N parts. + std::vector slices; + slices.reserve(split_count); + const int64 block_size = + operand_shape.dimensions(split_dimension) / split_count; + for (int i = 0; i < split_count; i++) { + slices.push_back(SliceInDim(operand, /*start_index=*/i * block_size, + /*limit_index=*/(i + 1) * block_size, + /*stride=*/1, /*dimno=*/split_dimension)); + } + + // Handle data communication. + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(auto slice_shapes, this->GetOperandShapes(slices)); + std::vector slice_shape_ptrs; + c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs), + [](const Shape& shape) { return &shape; }); + TF_ASSIGN_OR_RETURN( + *instr.mutable_shape(), + ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs)); + for (const ReplicaGroup& group : replica_groups) { + *instr.add_replica_groups() = group; + } + TF_ASSIGN_OR_RETURN( + XlaOp alltoall, + AddInstruction(std::move(instr), HloOpcode::kAllToAll, slices)); + + // Concat the N received parts. + std::vector received; + received.reserve(split_count); + for (int i = 0; i < split_count; i++) { + received.push_back(this->GetTupleElement(alltoall, i)); + } + return this->ConcatInDim(received, concat_dimension); + }); +} + XlaOp XlaBuilder::SelectAndScatter( const XlaOp& operand, const XlaComputation& select, tensorflow::gtl::ArraySlice window_dimensions, @@ -2137,11 +2207,6 @@ StatusOr XlaBuilder::BuildConstantSubGraph( TF_ASSIGN_OR_RETURN(const HloInstructionProto* root, LookUpInstruction(root_op)); - TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode())); - if (!CanBeRoot(opcode)) { - return InvalidArgument("the operand with opcode %s cannot be root", - root->opcode().c_str()); - } HloComputationProto entry; entry.set_id(GetUniqueId()); // Give the computation a global unique id. @@ -2667,6 +2732,13 @@ XlaOp CrossReplicaSum( replica_group_ids, channel_id); } +XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups) { + return operand.builder()->AllToAll(operand, split_dimension, concat_dimension, + split_count, replica_groups); +} + XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, @@ -2803,6 +2875,13 @@ XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices, window_bounds); } +XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers) { + return input.builder()->Scatter(input, scatter_indices, updates, + update_computation, dimension_numbers); +} + void Send(const XlaOp& operand, const ChannelHandle& handle) { return operand.builder()->Send(operand, handle); } diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index ae331407d6cbb08f8bfc25baabbedd1ba897231f..9403d7ca8dabc80a3964b50d29f158a98091f843 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -195,9 +195,14 @@ class XlaBuilder { // Builds the computation with the requested operations, or returns a non-ok // status. Note that all ops that have been enqueued will be moved to the - // computation being returned. + // computation being returned. The root of the computation will be the last + // added operation. StatusOr Build(); + // Overload of Build which specifies a particular root instruction for the + // computation. + StatusOr Build(XlaOp root); + // Builds the computation with the requested operations, or notes an error in // the parent XlaBuilder and returns an empty computation if building failed. // This function is intended to be used where the returned XlaComputation is @@ -225,9 +230,14 @@ class XlaBuilder { // Returns the shape of the given op. StatusOr GetShape(const XlaOp& op) const; - // Returns the (inferred) result for the current computation's shape. + // Returns the (inferred) result for the current computation's shape. This + // assumes the root instruction is the last added instruction. StatusOr GetProgramShape() const; + // Returns the (inferred) result for the current computation's shape using the + // given operation as the root. + StatusOr GetProgramShape(XlaOp root) const; + // Reports an error to the builder, by // * storing it internally and capturing a backtrace if it's the first error // (this deferred value will be produced on the call to @@ -255,6 +265,9 @@ class XlaBuilder { StatusOr IsConstant(const XlaOp& operand) const; private: + // Build helper which takes the id of the root operation.. + StatusOr Build(int64 root_id); + // Enqueues a "retrieve parameter value" instruction for a parameter that was // passed to the computation. XlaOp Parameter(int64 parameter_number, const Shape& shape, @@ -686,9 +699,9 @@ class XlaBuilder { // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means, // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1. // - // - `channel_id`: for Allreduce nodes from different models, if they have the - // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be - // applied cross models. + // - `channel_id`: for Allreduce nodes from different modules, if they have + // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will + // not be applied cross modules. // // TODO(b/79737069): Rename this to AllReduce when it's ready to use. XlaOp CrossReplicaSum( @@ -697,6 +710,13 @@ class XlaBuilder { const tensorflow::gtl::optional& channel_id = tensorflow::gtl::nullopt); + // Enqueues an operation that do an Alltoall of the operand cross cores. + // + // TODO(b/110096724): This is NOT YET ready to use. + XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups); + // Enqueues an operation that scatters the `source` array to the selected // indices of each window. XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, @@ -857,6 +877,11 @@ class XlaBuilder { const GatherDimensionNumbers& dimension_numbers, tensorflow::gtl::ArraySlice window_bounds); + // Enqueues a Scatter node onto the computation. + XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers); + // Enqueues a Send node onto the computation for device-to-device // communication, to send the given operand to a Recv instruction that shares // the same channel handle. @@ -964,9 +989,8 @@ class XlaBuilder { // shape. StatusOr Reshape(const Shape& shape, const XlaOp& operand); - // Returns the (inferred) result for the program shape for the current - // computation and fills the root_id in the pointer. - StatusOr GetProgramShape(int64* root_id) const; + // Returns the (inferred) result for the program shape using the given root. + StatusOr GetProgramShape(int64 root_id) const; // Returns shapes for the operands. StatusOr> GetOperandShapes( @@ -1229,6 +1253,9 @@ class XlaBuilder { const XlaOp& operand, const XlaComputation& computation, tensorflow::gtl::ArraySlice replica_group_ids, const tensorflow::gtl::optional& channel_id); + friend XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups); friend XlaOp SelectAndScatter( const XlaOp& operand, const XlaComputation& select, tensorflow::gtl::ArraySlice window_dimensions, @@ -1296,6 +1323,10 @@ class XlaBuilder { friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices, const GatherDimensionNumbers& dimension_numbers, tensorflow::gtl::ArraySlice window_bounds); + friend XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, + const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers); friend void Send(const XlaOp& operand, const ChannelHandle& handle); friend XlaOp Recv(XlaBuilder* builder, const Shape& shape, const ChannelHandle& handle); @@ -1811,9 +1842,9 @@ XlaOp CrossReplicaSum( // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means, // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1. // -// - `channel_id`: for Allreduce nodes from different models, if they have the +// - `channel_id`: for Allreduce nodes from different modules, if they have the // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be -// applied cross models. +// applied cross modules. // // TODO(b/79737069): Rename this to AllReduce when it's ready to use. XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation, @@ -1821,6 +1852,13 @@ XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation, const tensorflow::gtl::optional& channel_id = tensorflow::gtl::nullopt); +// Enqueues an operation that do an Alltoall of the operand cross cores. +// +// TODO(b/110096724): This is NOT YET ready to use. +XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, + int64 concat_dimension, int64 split_count, + const std::vector& replica_groups = {}); + // Enqueues an operation that scatters the `source` array to the selected // indices of each window. XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, @@ -1977,6 +2015,11 @@ XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices, const GatherDimensionNumbers& dimension_numbers, tensorflow::gtl::ArraySlice window_bounds); +// Enqueues a Scatter node onto the computation. +XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, + const XlaOp& updates, const XlaComputation& update_computation, + const ScatterDimensionNumbers& dimension_numbers); + // Enqueues a Send node onto the computation for device-to-device // communication. This operation sends the given operand to // a Recv instruction in a different computation that shares the same channel diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 28a207b137d901213ec43d506a638ef08a6bded9..49a15ec3b449bdec07aa6ecfbc40b7b9f62c3f4e 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -46,6 +47,17 @@ class XlaBuilderTest : public ::testing::Test { return HloModule::CreateFromProto(proto, config); } + // Overload which explicitly specifies the root instruction. + StatusOr> BuildHloModule(XlaBuilder* b, + XlaOp root) { + TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build(root)); + const HloModuleProto& proto = computation.proto(); + TF_ASSIGN_OR_RETURN(const auto& config, + HloModule::CreateModuleConfigFromProto( + proto, legacy_flags::GetDebugOptionsFromFlags())); + return HloModule::CreateFromProto(proto, config); + } + // Returns the name of the test currently being run. string TestName() const { return ::testing::UnitTest::GetInstance()->current_test_info()->name(); @@ -293,6 +305,21 @@ TEST_F(XlaBuilderTest, Transpose) { EXPECT_THAT(root, op::Transpose(op::Parameter())); } +TEST_F(XlaBuilderTest, AllToAll) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); + AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, + /*split_count=*/2); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + + // AllToAll is decomposed into slices -> all-to-all -> gte -> concat. + EXPECT_EQ(root->opcode(), HloOpcode::kConcatenate); + EXPECT_EQ(root->operand(0)->operand(0)->opcode(), HloOpcode::kAllToAll); + EXPECT_TRUE( + ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {8, 8}))); +} + TEST_F(XlaBuilderTest, ReportError) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x"); @@ -320,5 +347,45 @@ TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesErrors) { EXPECT_THAT(statusor.status().error_message(), HasSubstr("a test error")); } +TEST_F(XlaBuilderTest, BuildWithSpecificRoot) { + XlaBuilder b(TestName()); + XlaOp constant = ConstantR0(&b, 1.0); + Add(constant, ConstantR0(&b, 2.0)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/constant)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Constant()); +} + +TEST_F(XlaBuilderTest, BuildWithSpecificRootAndMultipleParameters) { + // Specifying a particular root in Build should still include all entry + // parameters. + XlaBuilder b(TestName()); + const Shape shape = ShapeUtil::MakeShape(F32, {42, 123}); + XlaOp x = Parameter(&b, 0, shape, "x"); + XlaOp y = Parameter(&b, 1, shape, "y"); + XlaOp z = Parameter(&b, 2, shape, "z"); + Add(x, Sub(y, z)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/x)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Parameter()); + EXPECT_EQ(module->entry_computation()->num_parameters(), 3); + EXPECT_EQ(module->entry_computation()->instruction_count(), 5); +} + +TEST_F(XlaBuilderTest, BuildWithSpecificRootWithWrongBuilder) { + XlaBuilder b(TestName()); + XlaBuilder other_b(TestName()); + const Shape shape = ShapeUtil::MakeShape(F32, {42, 123}); + + Parameter(&b, 0, shape, "param"); + XlaOp other_param = Parameter(&other_b, 0, shape, "other_param"); + + Status status = b.Build(other_param).status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT( + status.error_message(), + ::testing::HasSubstr("root operation is not in this computation")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD deleted file mode 100644 index 2e131dbad26970d4cb9860c17c3de3d52de36223..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/client/xla_client/BUILD +++ /dev/null @@ -1,33 +0,0 @@ -# Description: -# The new XLA client libraries. - -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = [":friends"]) - -package_group( - name = "friends", - includes = [ - "//tensorflow/compiler/xla:friends", - ], -) - -# Filegroup used to collect source files for dependency checking. -filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -load("//tensorflow:tensorflow.bzl", "tf_cc_test") - -cc_library( - name = "xla_builder", - hdrs = ["xla_builder.h"], - visibility = ["//visibility:public"], - deps = [ - "//tensorflow/compiler/xla/client:xla_builder", - ], -) diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py index abd10b164eaef8e75ed304483861baf250c5b954..fb135f5ceda67ce6c001de15b8f3f084ca164826 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -20,7 +20,7 @@ from __future__ import print_function import math -import numpy as np +import numpy as _np # Avoids becoming a part of public Tensorflow API. from tensorflow.compiler.xla import xla_data_pb2 from tensorflow.compiler.xla.python_api import xla_shape @@ -85,7 +85,7 @@ class Sharding(object): something we really want to expose to users (especially as the contract for tile_assignment is very strict). """ - if not isinstance(tile_assignment, np.ndarray): + if not isinstance(tile_assignment, _np.ndarray): raise TypeError('Tile assignment must be of type np.ndarray') if not isinstance(tile_shape, xla_shape.Shape): raise TypeError('Tile shape must be of type xla_shape.Shape') diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index 15eeb2ea13607d43c995197f8f0e3c58abd4d94a..b72d190d54591384392e79e73e90cf52df04a902 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -297,7 +297,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { shape.layout().padded_dimensions_size() == 0) { return false; } - CHECK(IsDenseArray(shape)); + CHECK(IsDenseArray(shape)) << shape.ShortDebugString(); CHECK_EQ(shape.dimensions_size(), shape.layout().padded_dimensions_size()); for (int64 i = 0; i < shape.dimensions_size(); ++i) { if (shape.layout().padded_dimensions(i) > shape.dimensions(i)) { diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index f42fb92359f40ec763866af094972046f6407ae1..1bf8948ef6ded56573d588258c3d9bbfaa55a50d 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -31,7 +31,6 @@ std::vector* flag_objects; std::once_flag flags_init; void SetDebugOptionsDefaults(DebugOptions* flags) { - flags->set_xla_enable_fast_math(true); flags->set_xla_llvm_enable_alias_scope_metadata(true); flags->set_xla_llvm_enable_noalias_metadata(true); flags->set_xla_llvm_enable_invariant_load_metadata(true); @@ -53,6 +52,11 @@ void SetDebugOptionsDefaults(DebugOptions* flags) { // the heuristics needed to decide when to run on multiple streams. See // b/77879207. flags->set_xla_gpu_disable_multi_streaming(true); + + // TODO(jlebar): Disable fastmath once doing so is not a performance + // regression. + flags->set_xla_cpu_enable_fast_math(true); + flags->set_xla_gpu_enable_fast_math(true); } // Allocates flag_values and flag_objects; this function must not be called more @@ -150,10 +154,16 @@ void AllocateFlags() { flag_values->mutable_xla_generate_hlo_text_to(), "Dump all HLO modules as text into the provided directory path."), tensorflow::Flag( - "xla_enable_fast_math", - bool_setter_for(&DebugOptions::set_xla_enable_fast_math), - flag_values->xla_enable_fast_math(), - "Enable unsafe fast-math optimizations in the compiler; " + "xla_cpu_enable_fast_math", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), + flag_values->xla_cpu_enable_fast_math(), + "Enable unsafe fast-math optimizations in the CPU compiler; " + "this may produce faster code at the expense of some accuracy."), + tensorflow::Flag( + "xla_gpu_enable_fast_math", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math), + flag_values->xla_cpu_enable_fast_math(), + "Enable unsafe fast-math optimizations in the GPU compiler; " "this may produce faster code at the expense of some accuracy."), tensorflow::Flag( "xla_llvm_enable_alias_scope_metadata", diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 0545deb096e9eace5a9713f200e10559aa718441..36e472568ecfdb97c828817ed339260ee7878723 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -71,7 +71,7 @@ std::ostream& operator<<(std::ostream& out, const Literal& literal) { return out; } -Literal::StrideConfig::StrideConfig( +MutableLiteralBase::StrideConfig::StrideConfig( const Shape& source_shape, const Shape& dest_shape, tensorflow::gtl::ArraySlice dimensions) : dimensions(dimensions), @@ -133,7 +133,8 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { } Literal::Literal(const Shape& shape, bool allocate_arrays) - : LiteralBase(), shape_(MakeUnique(shape)) { + : MutableLiteralBase() { + shape_ = MakeUnique(shape); CHECK(LayoutUtil::HasLayout(*shape_)); root_piece_ = new Piece(); root_piece_->set_subshape(shape_.get()); @@ -159,7 +160,9 @@ void Literal::DeallocateBuffers() { }); } -Literal::Literal(Literal&& other) : LiteralBase() { *this = std::move(other); } +Literal::Literal(Literal&& other) : MutableLiteralBase() { + *this = std::move(other); +} Literal& Literal::operator=(Literal&& other) { DCHECK(&other.root_piece_->subshape() == other.shape_.get()); @@ -187,12 +190,13 @@ const SparseIndexArray* LiteralBase::sparse_indices( return piece(shape_index).sparse_indices(); } -SparseIndexArray* Literal::sparse_indices(const ShapeIndex& shape_index) { +SparseIndexArray* MutableLiteralBase::sparse_indices( + const ShapeIndex& shape_index) { return piece(shape_index).sparse_indices(); } template -Status Literal::CopySliceFromInternal( +Status MutableLiteralBase::CopySliceFromInternal( const LiteralBase& src_literal, tensorflow::gtl::ArraySlice src_base, tensorflow::gtl::ArraySlice dest_base, tensorflow::gtl::ArraySlice copy_size) { @@ -225,8 +229,8 @@ Status Literal::CopySliceFromInternal( // proper stride size at the matching dimension. DimensionVector src_indexes(src_base.size(), 0); DimensionVector dest_indexes(dest_base.size(), 0); - Literal::StrideConfig stride_config(src_literal.shape(), shape(), - copy_size); + MutableLiteralBase::StrideConfig stride_config(src_literal.shape(), shape(), + copy_size); auto copy_proc = [&](tensorflow::gtl::ArraySlice indexes) { // Map from multi-dimensional index, to source index. @@ -253,9 +257,10 @@ Status Literal::CopySliceFromInternal( return Status::OK(); } -Status Literal::CopyElementFrom(const LiteralSlice& src_literal, - tensorflow::gtl::ArraySlice src_index, - tensorflow::gtl::ArraySlice dest_index) { +Status MutableLiteralBase::CopyElementFrom( + const LiteralSlice& src_literal, + tensorflow::gtl::ArraySlice src_index, + tensorflow::gtl::ArraySlice dest_index) { DCHECK_EQ(shape().element_type(), src_literal.shape().element_type()); const int64 src_linear_index = IndexUtil::MultidimensionalIndexToLinearIndex( src_literal.shape(), src_index); @@ -275,8 +280,8 @@ Status Literal::CopyElementFrom(const LiteralSlice& src_literal, return Status::OK(); } -/* static */ StatusOr> Literal::CreateFromProto( - const LiteralProto& proto) { +/* static */ StatusOr> +MutableLiteralBase::CreateFromProto(const LiteralProto& proto) { if (!proto.has_shape()) { return InvalidArgument("LiteralProto has no shape"); } @@ -405,9 +410,9 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { return Status::OK(); } -Status Literal::CopyFrom(const LiteralSlice& src_literal, - const ShapeIndex& dest_shape_index, - const ShapeIndex& src_shape_index) { +Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal, + const ShapeIndex& dest_shape_index, + const ShapeIndex& src_shape_index) { const Shape& dest_subshape = ShapeUtil::GetSubshape(shape(), dest_shape_index); const Shape& src_subshape = @@ -482,10 +487,11 @@ Status Literal::MoveFrom(Literal&& src_literal, return Status::OK(); } -Status Literal::CopySliceFrom(const LiteralSlice& src_literal, - tensorflow::gtl::ArraySlice src_base, - tensorflow::gtl::ArraySlice dest_base, - tensorflow::gtl::ArraySlice copy_size) { +Status MutableLiteralBase::CopySliceFrom( + const LiteralSlice& src_literal, + tensorflow::gtl::ArraySlice src_base, + tensorflow::gtl::ArraySlice dest_base, + tensorflow::gtl::ArraySlice copy_size) { TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape()); TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape())) << ShapeUtil::HumanString(src_literal.shape()); @@ -543,7 +549,7 @@ Status Literal::CopySliceFrom(const LiteralSlice& src_literal, shape().element_type()); } -void Literal::PopulateR1(const tensorflow::core::Bitmap& values) { +void MutableLiteralBase::PopulateR1(const tensorflow::core::Bitmap& values) { CHECK(ShapeUtil::IsArray(shape())); CHECK_EQ(ShapeUtil::Rank(shape()), 1); CHECK_EQ(element_count(), values.bits()); @@ -895,8 +901,8 @@ size_t LiteralBase::Hash() const { return hash_value; } -Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, - int64 value) { +Status MutableLiteralBase::SetIntegralAsS64( + tensorflow::gtl::ArraySlice multi_index, int64 value) { CHECK(LayoutUtil::IsDenseArray(shape())); switch (shape().element_type()) { case PRED: @@ -933,7 +939,7 @@ tensorflow::gtl::ArraySlice LiteralBase::GetSparseIndex( return p.sparse_indices()->At(sparse_element_number); } -void Literal::SortSparseElements(const ShapeIndex& shape_index) { +void MutableLiteralBase::SortSparseElements(const ShapeIndex& shape_index) { piece(shape_index).SortSparseElements(); } @@ -1391,11 +1397,11 @@ StatusOr> LiteralBase::ConvertToShape( elements.push_back(std::move(*new_element)); } auto converted = MakeUnique(); - *converted = Literal::MoveIntoTuple(&elements); + *converted = MutableLiteralBase::MoveIntoTuple(&elements); return std::move(converted); } -/* static */ Literal Literal::MoveIntoTuple( +/* static */ Literal MutableLiteralBase::MoveIntoTuple( tensorflow::gtl::MutableArraySlice elements) { std::vector element_shapes; for (const Literal& element : elements) { @@ -1808,7 +1814,8 @@ Status CopyFromRepeatedField(tensorflow::gtl::MutableArraySlice dest, } // namespace Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { - // These conditions should have been checked in Literal::CreateFromProto. + // These conditions should have been checked in + // MutableLiteralBase::CreateFromProto. TF_RET_CHECK(proto.has_shape()); TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape())); TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape())); @@ -1900,7 +1907,7 @@ const void* LiteralBase::untyped_data(const ShapeIndex& shape_index) const { return piece(shape_index).untyped_data(); } -void* Literal::untyped_data(const ShapeIndex& shape_index) { +void* MutableLiteralBase::untyped_data(const ShapeIndex& shape_index) { return piece(shape_index).untyped_data(); } @@ -1916,6 +1923,127 @@ string LiteralBase::GetR1U8AsString() const { ShapeUtil::ElementsIn(shape())); } +void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape, + Piece* src_piece, + Piece* dest_piece) { + DCHECK(ShapeUtil::Equal(src_piece->subshape(), dest_piece->subshape())) + << "src_piece has shape: " + << ShapeUtil::HumanString(src_piece->subshape()) + << "dest_piece has shape: " + << ShapeUtil::HumanString(dest_piece->subshape()); + if (ShapeUtil::IsTuple(shape)) { + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const Shape& subshape = shape.tuple_shapes(i); + + auto child_piece = Piece(); + child_piece.set_subshape(&subshape); + + CopyPieceSubtree(subshape, &src_piece->child(i), &child_piece); + + dest_piece->emplace_back(std::move(child_piece)); + } + } else if (ShapeUtil::IsArray(shape)) { + dest_piece->set_buffer(src_piece->buffer()); + } else { + // If the shape is neither an array nor tuple, then it must be + // zero-sized. Otherwise, some memory needs to be allocated for it. + CHECK_EQ(dest_piece->size_bytes(), 0); + } +} + +MutableLiteralBase::~MutableLiteralBase() {} + +MutableBorrowingLiteral::MutableBorrowingLiteral( + const MutableBorrowingLiteral& literal) + : MutableLiteralBase() { + shape_ = MakeUnique(literal.shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_); +} + +MutableBorrowingLiteral& MutableBorrowingLiteral::operator=( + const MutableBorrowingLiteral& literal) { + shape_ = MakeUnique(literal.shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_); + + return *this; +} + +MutableBorrowingLiteral::MutableBorrowingLiteral( + const MutableLiteralBase& literal) + : MutableLiteralBase() { + shape_ = MakeUnique(literal.shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.root_piece(), root_piece_); +} + +MutableBorrowingLiteral::MutableBorrowingLiteral(MutableLiteralBase* literal) + : MutableLiteralBase() { + shape_ = MakeUnique(literal->shape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal->root_piece(), root_piece_); +} + +MutableBorrowingLiteral::MutableBorrowingLiteral( + MutableBorrowingLiteral literal, const ShapeIndex& view_root) + : MutableLiteralBase() { + shape_ = MakeUnique(literal.piece(view_root).subshape()); + CHECK(LayoutUtil::HasLayout(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + CopyPieceSubtree(*shape_, &literal.piece(view_root), root_piece_); +} + +MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr, + const Shape& shape) + : MutableLiteralBase() { + shape_ = MakeUnique(shape); + CHECK(LayoutUtil::HasLayout(*shape_)); + CHECK(!ShapeUtil::IsTuple(*shape_)); + + root_piece_ = new Piece(); + root_piece_->set_buffer(const_cast(src_buf_ptr)); + root_piece_->set_subshape(shape_.get()); +} + +MutableBorrowingLiteral::~MutableBorrowingLiteral() { + if (root_piece_ != nullptr) { + root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* piece) { + if (piece->buffer() != nullptr) { + delete piece->sparse_indices(); + } + }); + delete root_piece_; + } +} + +LiteralSlice::LiteralSlice(const LiteralBase& literal) + : LiteralBase(), root_piece_(&literal.root_piece()) {} + +LiteralSlice::LiteralSlice(const LiteralBase& literal, + const ShapeIndex& view_root) + : LiteralBase(), root_piece_(&literal.piece(view_root)) {} + void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) { CHECK(ShapeUtil::IsTuple(shape)); for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { @@ -1932,13 +2060,6 @@ void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) { } } -LiteralSlice::LiteralSlice(const LiteralBase& literal) - : LiteralBase(), root_piece_(&literal.root_piece()) {} - -LiteralSlice::LiteralSlice(const LiteralBase& literal, - const ShapeIndex& view_root) - : LiteralBase(), root_piece_(&literal.piece(view_root)) {} - BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape) : LiteralBase(), shape_(MakeUnique(shape)) { CHECK(ShapeUtil::IsArray(*shape_)); diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index dd67dfa8d4a556aea179bc47abfdc9a9c8872c45..92c0f903cbe252a153103aa8514bb5531696bbfe 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -310,9 +310,10 @@ class LiteralBase { // type of literal itself (0 for numeric types, and false for predicates). // // Note: It's an antipattern to use this method then immediately call - // Literal::Populate on the result (since that results in zero initialization, - // then reinitialization. Conside if a call to MakeUnique(shape), - // followed by the call to Literal::Populate can be used instead. + // MutableLiteralBase::Populate on the result (since that results in zero + // initialization, then reinitialization. Conside if a call to + // MakeUnique(shape), followed by the call to + // MutableLiteralBase::Populate can be used instead. static std::unique_ptr CreateFromShape(const Shape& shape); protected: @@ -534,7 +535,7 @@ class LiteralBase { virtual const Piece& root_piece() const = 0; // LiteralSlice and Literal must access Pieces of other Literals. - friend class Literal; + friend class MutableLiteralBase; friend class LiteralSlice; friend class BorrowingLiteral; @@ -545,33 +546,10 @@ class LiteralBase { tensorflow::gtl::ArraySlice start_indices) const; }; -// Class representing literal values in XLA. -// -// The underlying buffer and shape is always owned by this class. -class Literal : public LiteralBase { +// Abstract base class representing a mutable literal in XLA. +class MutableLiteralBase : public LiteralBase { public: - Literal() : Literal(ShapeUtil::MakeNil()) {} - - // Create a literal of the given shape. The literal is allocated sufficient - // memory to hold the shape. Memory is uninitialized. - explicit Literal(const Shape& shape); - virtual ~Literal(); - - // Literals are moveable, but not copyable. To copy a literal use - // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies - // of literals which can be expensive. - Literal(const Literal& other) = delete; - Literal& operator=(const Literal& other) = delete; - Literal(Literal&& other); - // 'allocate_arrays' indicates whether to allocate memory for the arrays in - // the shape. If false, buffer pointers inside of the Literal::Pieces are set - // to nullptr. - Literal(const Shape& shape, bool allocate_arrays); - Literal& operator=(Literal&& other); - - // TODO(b/67651157): Remove this accessor. Literal users should not be able to - // mutate the shape as this can produce malformed Literals. - Shape* mutable_shape_do_not_use() { return shape_.get(); } + virtual ~MutableLiteralBase() = 0; // Returns a MutableArraySlice view of the array for this literal for the // given NativeT (e.g., float). CHECKs if the subshape of the literal at the @@ -587,6 +565,10 @@ class Literal : public LiteralBase { // is not a sparse array. SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {}); + // TODO(b/67651157): Remove this accessor. Literal users should not be able to + // mutate the shape as this can produce malformed Literals. + Shape* mutable_shape_do_not_use() { return shape_.get(); } + // Returns a pointer to the underlying buffer holding the array at the given // shape index. CHECKs if the subshape of the literal at the given ShapeIndex // is not array. @@ -613,21 +595,6 @@ class Literal : public LiteralBase { const ShapeIndex& dest_shape_index = {}, const ShapeIndex& src_shape_index = {}); - // Returns a vector containing the tuple elements of this Literal as separate - // Literals. This Literal must be tuple-shaped and can be a nested tuple. The - // elements are moved into the new Literals; no data is copied. Upon return - // this Literal is set to a nil shape (empty tuple) - std::vector DecomposeTuple(); - - // Similar to CopyFrom, but with move semantincs. The subshape of this literal - // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal' - // (layouts and shapes must match), but need not be arrays. The memory - // allocated in this literal for the subshape at dest_shape_index is - // deallocated, and the respective buffers are replaced with those in - // src_literal. Upon return, src_literal is set to a nil shape (empty tuple). - Status MoveFrom(Literal&& src_literal, - const ShapeIndex& dest_shape_index = {}); - // Copies the values from src_literal, starting at src_base shape indexes, // to this literal, starting at dest_base, where the copy size in each // dimension is specified by copy_size. @@ -730,12 +697,7 @@ class Literal : public LiteralBase { static StatusOr> CreateFromProto( const LiteralProto& proto); - private: - // Recursively sets the subshapes and buffers of all subpieces rooted at - // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in - // the shape. - void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays); - + protected: // Returns the piece at the given ShapeIndex. Piece& piece(const ShapeIndex& shape_index) { return const_cast(LiteralBase::piece(shape_index)); @@ -783,12 +745,83 @@ class Literal : public LiteralBase { template Status PopulateInternal(const FnType& generator, bool parallel); + friend class LiteralBase; + friend class MutableBorrowingLiteral; +}; +std::ostream& operator<<(std::ostream& out, const Literal& literal); + +// The underlying buffer and shape is always owned by this class. +class Literal : public MutableLiteralBase { + public: + Literal() : Literal(ShapeUtil::MakeNil()) {} + + // Create a literal of the given shape. The literal is allocated sufficient + // memory to hold the shape. Memory is uninitialized. + explicit Literal(const Shape& shape); + virtual ~Literal(); + + // Literals are moveable, but not copyable. To copy a literal use + // Literal::Clone or Literal::CloneToUnique. This prevents inadvertent copies + // of literals which can be expensive. + Literal(const Literal& other) = delete; + Literal& operator=(const Literal& other) = delete; + Literal(Literal&& other); + // 'allocate_arrays' indicates whether to allocate memory for the arrays in + // the shape. If false, buffer pointers inside of the Literal::Pieces are set + // to nullptr. + Literal(const Shape& shape, bool allocate_arrays); + Literal& operator=(Literal&& other); + + // Similar to CopyFrom, but with move semantincs. The subshape of this literal + // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal' + // (layouts and shapes must match), but need not be arrays. The memory + // allocated in this literal for the subshape at dest_shape_index is + // deallocated, and the respective buffers are replaced with those in + // src_literal. Upon return, src_literal is set to a nil shape (empty tuple). + virtual Status MoveFrom(Literal&& src_literal, + const ShapeIndex& dest_shape_index = {}); + + // Returns a vector containing the tuple elements of this Literal as separate + // Literals. This Literal must be tuple-shaped and can be a nested tuple. The + // elements are moved into the new Literals; no data is copied. Upon return + // this Literal is set to a nil shape (empty tuple) + std::vector DecomposeTuple(); + + private: // Deallocate the buffers held by this literal. void DeallocateBuffers(); - friend class LiteralBase; + // Recursively sets the subshapes and buffers of all subpieces rooted at + // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in + // the shape. + void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays); +}; + +// The underlying buffer is not owned by this class and is always owned by +// others. The shape is not owned by this class and not mutable. +class MutableBorrowingLiteral : public MutableLiteralBase { + public: + virtual ~MutableBorrowingLiteral(); + + MutableBorrowingLiteral() : MutableLiteralBase() {} + + MutableBorrowingLiteral(const MutableBorrowingLiteral& literal); + MutableBorrowingLiteral& operator=(const MutableBorrowingLiteral& literal); + + // Implicit conversion constructors. + MutableBorrowingLiteral(const MutableLiteralBase& literal); + MutableBorrowingLiteral(MutableLiteralBase* literal); + MutableBorrowingLiteral(MutableBorrowingLiteral literal, + const ShapeIndex& view_root); + MutableBorrowingLiteral(const char* src_buf_ptr, const Shape& shape); + + private: + // Recursively copies the subtree from the `src_piece` at the given child + // index to the `dest_piece`. For buffers only the pointers are copied, but + // not the content. + void CopyPieceSubtree(const Shape& shape, Piece* src_piece, + Piece* dest_piece); }; -std::ostream& operator<<(std::ostream& out, const Literal& literal); // A read-only view of a Literal. A LiteralSlice contains pointers to shape and // literal buffers always owned by others. @@ -831,9 +864,9 @@ class BorrowingLiteral : public LiteralBase { const Piece& root_piece() const override { return root_piece_; }; Piece root_piece_; - // Shape of this literal. Stored as unique_ptr so such that the (default) - // move construction of this class would be trivially correct: the pointer to - // Shape root_piece_ stores will still point to the correct address. + // Shape of this literal. Stored as unique_ptr such that the (default) move + // construction of this class would be trivially correct: the pointer to Shape + // root_piece_ stores will still point to the correct address. std::unique_ptr shape_; }; @@ -886,7 +919,7 @@ tensorflow::gtl::ArraySlice LiteralBase::data( } template -tensorflow::gtl::MutableArraySlice Literal::data( +tensorflow::gtl::MutableArraySlice MutableLiteralBase::data( const ShapeIndex& shape_index) { return piece(shape_index).data(); } @@ -904,14 +937,15 @@ inline NativeT LiteralBase::Get( } template -inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index, NativeT value) { +inline void MutableLiteralBase::Set( + tensorflow::gtl::ArraySlice multi_index, + const ShapeIndex& shape_index, NativeT value) { return piece(shape_index).Set(multi_index, value); } template -inline void Literal::Set(tensorflow::gtl::ArraySlice multi_index, - NativeT value) { +inline void MutableLiteralBase::Set( + tensorflow::gtl::ArraySlice multi_index, NativeT value) { return root_piece().Set(multi_index, value); } @@ -929,7 +963,7 @@ NativeT LiteralBase::GetSparseElement(int64 sparse_element_number, } template -void Literal::AppendSparseElement( +void MutableLiteralBase::AppendSparseElement( tensorflow::gtl::ArraySlice multi_index, NativeT value, const ShapeIndex& shape_index) { Piece& p = piece(shape_index); @@ -959,7 +993,8 @@ void LiteralBase::EachCell( } template -inline void Literal::PopulateR1(tensorflow::gtl::ArraySlice values) { +inline void MutableLiteralBase::PopulateR1( + tensorflow::gtl::ArraySlice values) { CHECK(ShapeUtil::IsArray(shape())); CHECK_EQ(ShapeUtil::Rank(shape()), 1); CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size()); @@ -971,7 +1006,7 @@ inline void Literal::PopulateR1(tensorflow::gtl::ArraySlice values) { } template -void Literal::PopulateR2( +void MutableLiteralBase::PopulateR2( std::initializer_list> values) { CHECK(ShapeUtil::IsArray(shape())); CHECK_EQ(ShapeUtil::Rank(shape()), 2); @@ -996,7 +1031,7 @@ void Literal::PopulateR2( } template -void Literal::PopulateFromArray(const Array& values) { +void MutableLiteralBase::PopulateFromArray(const Array& values) { CHECK(ShapeUtil::IsArray(shape())); CHECK_EQ(shape().element_type(), primitive_util::NativeToPrimitiveType()); @@ -1009,24 +1044,24 @@ void Literal::PopulateFromArray(const Array& values) { } template -void Literal::PopulateR2FromArray2D(const Array2D& values) { +void MutableLiteralBase::PopulateR2FromArray2D(const Array2D& values) { PopulateFromArray(values); } template -void Literal::PopulateR3FromArray3D(const Array3D& values) { +void MutableLiteralBase::PopulateR3FromArray3D(const Array3D& values) { PopulateFromArray(values); } template -void Literal::PopulateR4FromArray4D(const Array4D& values) { +void MutableLiteralBase::PopulateR4FromArray4D(const Array4D& values) { PopulateFromArray(values); } template -void Literal::PopulateSparse(SparseIndexArray indices, - tensorflow::gtl::ArraySlice values, - bool sort) { +void MutableLiteralBase::PopulateSparse( + SparseIndexArray indices, tensorflow::gtl::ArraySlice values, + bool sort) { CHECK(LayoutUtil::IsSparseArray(shape())); int rank = ShapeUtil::Rank(shape()); CHECK_EQ(indices.rank(), rank); @@ -1049,7 +1084,8 @@ void Literal::PopulateSparse(SparseIndexArray indices, } template -Status Literal::PopulateInternal(const FnType& generator, bool parallel) { +Status MutableLiteralBase::PopulateInternal(const FnType& generator, + bool parallel) { const Shape& this_shape = shape(); const int64 rank = ShapeUtil::Rank(this_shape); TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape)); @@ -1092,17 +1128,17 @@ Status Literal::PopulateInternal(const FnType& generator, bool parallel) { return Status::OK(); } template -Status Literal::Populate(const FnType& generator) { +Status MutableLiteralBase::Populate(const FnType& generator) { return PopulateInternal(generator, /*parallel=*/false); } template -Status Literal::PopulateParallel(const FnType& generator) { +Status MutableLiteralBase::PopulateParallel(const FnType& generator) { return PopulateInternal(generator, /*parallel=*/true); } template -void Literal::PopulateWithValue(NativeT value) { +void MutableLiteralBase::PopulateWithValue(NativeT value) { CHECK(ShapeUtil::IsArray(shape())); CHECK_EQ(shape().element_type(), primitive_util::NativeToPrimitiveType()); diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index 548fbe8a83a3797aa8ac32dc1f6c085fc0100197..5d33df7d40bf3bfcc8012ce1129d532b34555344 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -34,9 +34,9 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" -using tensorflow::strings::Printf; using tensorflow::strings::StrCat; namespace xla { diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc index fed0e58e66a04df2ff9554cb0dd0053b7c669803..69ef4f7a2f3ea559a334a11cbe8392b610742bab 100644 --- a/tensorflow/compiler/xla/metric_table_report.cc +++ b/tensorflow/compiler/xla/metric_table_report.cc @@ -134,8 +134,7 @@ void MetricTableReport::AppendHeader() { void MetricTableReport::AppendCategoryTable() { const std::vector categories = MakeCategories(&entries_); - AppendLine("********** categories table **********"); - AppendLine("The left hand side numbers are ", metric_name_, "."); + AppendLine("********** categories table for ", metric_name_, " **********"); AppendLine(); double metric_sum = UnaccountedMetric(); @@ -185,8 +184,8 @@ void MetricTableReport::AppendCategoryTable() { } void MetricTableReport::AppendEntryTable() { - AppendLine("********** ", entry_name_, " table **********"); - AppendLine("The left hand side numbers are ", metric_name_, "."); + AppendLine("********** ", entry_name_, " table for ", metric_name_, + " **********"); AppendLine(); double metric_sum = UnaccountedMetric(); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 434d78d78dd58f8bfcb992eb4f3d81beaadb56c3..8246f76d3443d58f4174cc4f86100f54d6b46928 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -624,6 +624,7 @@ _FORWARD_BINOP(ShiftRightArithmetic) _FORWARD_BINOP(ShiftRightLogical) _FORWARD_BINOP(Atan2) _FORWARD_BINOP(Pow) +_FORWARD_BINOP(Complex) _FORWARD_UNOP(Not) _FORWARD_UNOP(Abs) _FORWARD_UNOP(Exp) @@ -658,6 +659,9 @@ _FORWARD_UNOP(Asinh) _FORWARD_UNOP(Atanh) _FORWARD_UNOP(Cosh) _FORWARD_UNOP(Sinh) +_FORWARD_UNOP(Real) +_FORWARD_UNOP(Imag) +_FORWARD_UNOP(Conj) #undef _FORWARD #undef _FORWARD_UNOP diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 545aa63f9d6e2e2e26c26f49941a5160279154b3..a568c24c6376e1fe17f5e5a4f6626bf0970985a3 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -341,6 +341,7 @@ class LocalComputationBuilder { _FORWARD_BINOP(ShiftRightLogical) _FORWARD_BINOP(Atan2) _FORWARD_BINOP(Pow) + _FORWARD_BINOP(Complex) _FORWARD_UNOP(Not) _FORWARD_UNOP(Abs) _FORWARD_UNOP(Exp) @@ -375,6 +376,9 @@ class LocalComputationBuilder { _FORWARD_UNOP(Atanh) _FORWARD_UNOP(Cosh) _FORWARD_UNOP(Sinh) + _FORWARD_UNOP(Real) + _FORWARD_UNOP(Imag) + _FORWARD_UNOP(Conj) #undef _FORWARD #undef _FORWARD_UNOP diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index 9b8b0aa7f28e64f434bb24f88a3a9cbe177f8a78..5d5a955bfee35b38a61b9a9f792c1b31259ce044 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -1029,6 +1029,10 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputationBuilder::Atanh; %unignore xla::swig::LocalComputationBuilder::Cosh; %unignore xla::swig::LocalComputationBuilder::Sinh; +%unignore xla::swig::LocalComputationBuilder::Real; +%unignore xla::swig::LocalComputationBuilder::Imag; +%unignore xla::swig::LocalComputationBuilder::Conj; +%unignore xla::swig::LocalComputationBuilder::Complex; %unignore xla::swig::DestructureLocalShapedBufferTuple; %unignore xla::swig::DeleteLocalShapedBuffer; %unignore xla::swig::DeleteLocalComputation; diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index 71351abd593d45fb5080112438a91df368eee173..6f665faf61b25b23a32ce4d0a012543ba18d7e64 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -50,6 +50,8 @@ int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) { return NPY_FLOAT32; case F64: return NPY_FLOAT64; + case C64: + return NPY_COMPLEX64; case TUPLE: return NPY_OBJECT; default: @@ -83,6 +85,8 @@ PrimitiveType NumpyTypeToPrimitiveType(int np_type) { return F32; case NPY_FLOAT64: return F64; + case NPY_COMPLEX64: + return C64; case NPY_OBJECT: return TUPLE; default: @@ -104,6 +108,7 @@ bool NumpyTypeIsValid(int np_type) { case NPY_FLOAT16: case NPY_FLOAT32: case NPY_FLOAT64: + case NPY_COMPLEX64: case NPY_OBJECT: return true; default: @@ -425,6 +430,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array, case NPY_FLOAT64: CopyNumpyArrayToLiteral(py_array, literal); break; + case NPY_COMPLEX64: + CopyNumpyArrayToLiteral(py_array, literal); + break; default: return InvalidArgument( "No XLA literal container for Numpy type number: %d", np_type); @@ -462,6 +470,9 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal, case NPY_FLOAT64: CopyLiteralToNumpyArray(literal, py_array); break; + case NPY_COMPLEX64: + CopyLiteralToNumpyArray(literal, py_array); + break; default: LOG(FATAL) << "No XLA literal container for Numpy type" << np_type; } diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index c0105b385b02e13b360ad1fb5af734d2209a92c2..a2c6fc344d192265d536ef7e23ad5c6d7c847014 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -120,6 +120,9 @@ _UNARY_OPS = [ 'Atanh', 'Cosh', 'Sinh', + 'Real', + 'Imag', + 'Conj', ] _BINARY_OPS = [ @@ -144,6 +147,7 @@ _BINARY_OPS = [ 'ShiftRightArithmetic', 'ShiftRightLogical', 'Atan2', + 'Complex', ] diff --git a/tensorflow/compiler/xla/python_api/BUILD b/tensorflow/compiler/xla/python_api/BUILD index 8999cda5ef852d1246bea45a3312575ec1ac0721..d790c4db6c466a2bf4d2cf30365749fb901f74a0 100644 --- a/tensorflow/compiler/xla/python_api/BUILD +++ b/tensorflow/compiler/xla/python_api/BUILD @@ -10,6 +10,8 @@ py_library( srcs = ["types.py"], deps = [ "//tensorflow/compiler/xla:xla_data_proto_py", + "//tensorflow/python:dtypes", + "//tensorflow/python:platform", "//third_party/py/numpy", ], ) diff --git a/tensorflow/compiler/xla/python_api/types.py b/tensorflow/compiler/xla/python_api/types.py index b60f8dce92ace1b2c682374a2605b3a477936bbc..57dfce3971b829d2a3052d347e5d2d322db0c841 100644 --- a/tensorflow/compiler/xla/python_api/types.py +++ b/tensorflow/compiler/xla/python_api/types.py @@ -20,9 +20,10 @@ from __future__ import print_function import collections -import numpy as np +import numpy as _np # Avoids becoming a part of public Tensorflow API. from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.python.framework import dtypes # Records corresponsence between a XLA primitive type and Python/Numpy types. # @@ -40,76 +41,82 @@ TypeConversionRecord = collections.namedtuple('TypeConversionRecord', [ # Maps from XLA primitive types to TypeConversionRecord. MAP_XLA_TYPE_TO_RECORD = { + xla_data_pb2.BF16: + TypeConversionRecord( + primitive_type=xla_data_pb2.BF16, + numpy_dtype=dtypes.bfloat16.as_numpy_dtype, + literal_field_name='bf16s', + literal_field_type=float), xla_data_pb2.F16: TypeConversionRecord( primitive_type=xla_data_pb2.F16, - numpy_dtype=np.float16, + numpy_dtype=_np.float16, literal_field_name='f16s', literal_field_type=float), xla_data_pb2.F32: TypeConversionRecord( primitive_type=xla_data_pb2.F32, - numpy_dtype=np.float32, + numpy_dtype=_np.float32, literal_field_name='f32s', literal_field_type=float), xla_data_pb2.F64: TypeConversionRecord( primitive_type=xla_data_pb2.F64, - numpy_dtype=np.float64, + numpy_dtype=_np.float64, literal_field_name='f64s', literal_field_type=float), xla_data_pb2.S8: TypeConversionRecord( primitive_type=xla_data_pb2.S8, - numpy_dtype=np.int8, + numpy_dtype=_np.int8, literal_field_name='s8s', literal_field_type=int), xla_data_pb2.S16: TypeConversionRecord( primitive_type=xla_data_pb2.S16, - numpy_dtype=np.int16, + numpy_dtype=_np.int16, literal_field_name='s16s', literal_field_type=int), xla_data_pb2.S32: TypeConversionRecord( primitive_type=xla_data_pb2.S32, - numpy_dtype=np.int32, + numpy_dtype=_np.int32, literal_field_name='s32s', literal_field_type=int), xla_data_pb2.S64: TypeConversionRecord( primitive_type=xla_data_pb2.S64, - numpy_dtype=np.int64, + numpy_dtype=_np.int64, literal_field_name='s64s', literal_field_type=int), xla_data_pb2.U8: TypeConversionRecord( primitive_type=xla_data_pb2.U8, - numpy_dtype=np.uint8, + numpy_dtype=_np.uint8, literal_field_name='s8s', literal_field_type=int), xla_data_pb2.U16: TypeConversionRecord( primitive_type=xla_data_pb2.U16, - numpy_dtype=np.uint16, + numpy_dtype=_np.uint16, literal_field_name='s16s', literal_field_type=int), xla_data_pb2.U32: TypeConversionRecord( primitive_type=xla_data_pb2.U32, - numpy_dtype=np.uint32, + numpy_dtype=_np.uint32, literal_field_name='s32s', literal_field_type=int), xla_data_pb2.U64: TypeConversionRecord( primitive_type=xla_data_pb2.U64, - numpy_dtype=np.uint64, + numpy_dtype=_np.uint64, literal_field_name='s64s', literal_field_type=int), xla_data_pb2.PRED: TypeConversionRecord( primitive_type=xla_data_pb2.PRED, - numpy_dtype=np.bool, + numpy_dtype=_np.bool, literal_field_name='preds', literal_field_type=bool) } @@ -119,6 +126,6 @@ MAP_XLA_TYPE_TO_RECORD = { # doesn't work as expected (https://github.com/numpy/numpy/issues/7242). Thus, # when keying by dtype in this dict, we use the string form of dtypes. MAP_DTYPE_TO_RECORD = { - str(np.dtype(record.numpy_dtype)): record + str(_np.dtype(record.numpy_dtype)): record for record in MAP_XLA_TYPE_TO_RECORD.values() } diff --git a/tensorflow/compiler/xla/python_api/xla_literal.py b/tensorflow/compiler/xla/python_api/xla_literal.py index b040098c294ffaae92b72f678947f99289239314..757e41a78ad2b57d2ef6e1f3055160be22c7b3ed 100644 --- a/tensorflow/compiler/xla/python_api/xla_literal.py +++ b/tensorflow/compiler/xla/python_api/xla_literal.py @@ -18,7 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np +import numpy as _np # Avoids becoming a part of public Tensorflow API. from tensorflow.compiler.xla import xla_data_pb2 from tensorflow.compiler.xla.python_api import types @@ -35,7 +35,7 @@ def ConvertLiteralToNumpyArray(literal): type_record = types.MAP_XLA_TYPE_TO_RECORD[element_type] if not literal.shape.dimensions: - return np.array( + return _np.array( getattr(literal, type_record.literal_field_name)[0], type_record.numpy_dtype) else: @@ -54,7 +54,7 @@ def ConvertLiteralToNumpyArray(literal): numpy_reshaper = lambda arr: arr.reshape(numpy_shape, order='C') else: raise NotImplementedError('Unsupported layout: {0}'.format(layout_order)) - ndarray = np.array( + ndarray = _np.array( getattr(literal, type_record.literal_field_name), copy=False, dtype=type_record.numpy_dtype) @@ -69,11 +69,11 @@ def _ConvertNumpyArrayToLiteral(ndarray): if ndarray.ndim == 0: getattr(literal, type_record.literal_field_name).append( - np.asscalar(ndarray.astype(type_record.literal_field_type))) + _np.asscalar(ndarray.astype(type_record.literal_field_type))) else: # Ndarrays with boolean dtypes need special type conversion with protobufs - if ndarray.dtype in {np.bool_, np.dtype('bool')}: - for element in np.nditer(ndarray): + if ndarray.dtype in {_np.bool_, _np.dtype('bool')}: + for element in _np.nditer(ndarray): getattr(literal, type_record.literal_field_name).append( type_record.literal_field_type(element)) else: diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py b/tensorflow/compiler/xla/python_api/xla_shape.py index 6af28958035bbb03e7e1dbb0d0c7bb2c2f25b96d..f158f6b2410352432445f669155aff0af5526abf 100644 --- a/tensorflow/compiler/xla/python_api/xla_shape.py +++ b/tensorflow/compiler/xla/python_api/xla_shape.py @@ -18,7 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np +import numpy as _np # Avoids becoming a part of public Tensorflow API. from tensorflow.compiler.xla import xla_data_pb2 from tensorflow.compiler.xla.python_api import types @@ -111,7 +111,7 @@ def _CreateShapeFromNumpy(ndarray): # pylint: disable=invalid-name # Set the shape's layout based on the ordering of ndarray. # Numpy arrays come in two orders: Fortran (column-major) and C (row-major). - if np.isfortran(ndarray): + if _np.isfortran(ndarray): # Column-major layout. This corresponds to a "dimension order is # minor-to-major" layout in XLA. layout = range(ndarray.ndim) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 528b7fdfd3c39cc3a56afc92474dbae976a08ba8..7d315fa0d3d8e38cefbccf9b71d9bd0706a7a434 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -570,7 +570,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//third_party/eigen3", @@ -613,6 +613,7 @@ cc_library( "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/core:lib", + "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", ], alwayslink = 1, @@ -1384,6 +1385,18 @@ tf_cc_test( ], ) +cc_library( + name = "while_loop_analysis", + srcs = ["while_loop_analysis.cc"], + hdrs = ["while_loop_analysis.h"], + deps = [ + ":hlo", + ":hlo_evaluator", + "//tensorflow/compiler/xla:literal", + "//tensorflow/core:lib", + ], +) + cc_library( name = "while_loop_simplifier", srcs = ["while_loop_simplifier.cc"], @@ -1391,8 +1404,8 @@ cc_library( deps = [ ":call_inliner", ":hlo", - ":hlo_evaluator", ":hlo_pass", + ":while_loop_analysis", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:lib", ], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 946ef6f0d6b9025b84c4b9341f4ec600465d4b1e..37834e1cc2657ff56f65a4f94eb973b9022eb8e1 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1803,6 +1803,12 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice( } Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Most of those optimizations can be done for multi-output + // reduces. + if (ShapeUtil::IsTuple(reduce->shape())) { + return Status::OK(); + } + auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index ad14fe6f2c8b8d30204771c8c8a7fee4ca476105..862cbeeba6b82e1f24a6616b3237dc47d022e9af 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -2006,7 +2006,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { // Builds a convolution from and runs algebraic simplification on // the computation. Returns a string description of the result of // simplification. - auto build_and_simplify = [&options, this]() -> string { + auto build_and_simplify = [&options]() -> string { HloComputation::Builder b(TestName()); Window window; diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 95b4cb6d2e694063b648b264bd2454ae0a5469ff..51ebc4763b612884a4453edec5711f78c4006fc3 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -109,11 +109,11 @@ Status AllocationTracker::Unregister(const GlobalDataHandle& data) { ResolveInternal(data)); for (const auto& shaped_buffer : replicated_buffers) { std::vector shape_indices; - ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(), - [this, &shape_indices](const Shape& /*subshape*/, - const ShapeIndex& index) { - shape_indices.push_back(index); - }); + ShapeUtil::ForEachSubshape( + shaped_buffer->on_device_shape(), + [&shape_indices](const Shape& /*subshape*/, const ShapeIndex& index) { + shape_indices.push_back(index); + }); for (const ShapeIndex& index : shape_indices) { TF_RETURN_IF_ERROR(DecrementRefCount(shaped_buffer->buffer(index), shaped_buffer->device_ordinal())); diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc index 32f785a70adf0e7ea3ce281f7ff73224be8d424e..a725351462809e5b670bbf1d79d2dded87e54f07 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc @@ -137,9 +137,9 @@ ENTRY entry { if (instruction->opcode() == HloOpcode::kParameter) { continue; } - ASSERT_TRUE(instruction->has_sharding()); - TF_ASSERT_OK_AND_ASSIGN(int device, instruction->sharding().UniqueDevice()); - EXPECT_EQ(device, 1); + auto device = instruction->sharding_unique_device(); + ASSERT_TRUE(device); + EXPECT_EQ(*device, 1); } } diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index e4d2e73b994819f748bceb6a9b2f9c1ca7c16308..118a11c8de3c06d240079723f0a5db314cfcace5 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -877,8 +877,8 @@ Status BufferAssigner::AssignBuffersForComputation( // important reuse case where an elementwise instruction reuses one of its // operand's buffer. This improves locality. std::sort(sorted_buffers.begin(), sorted_buffers.end(), - [this, has_sequential_order, &liveness, &post_order_position, - assignment](const LogicalBuffer* a, const LogicalBuffer* b) { + [has_sequential_order, &liveness, &post_order_position, assignment]( + const LogicalBuffer* a, const LogicalBuffer* b) { // Primary sort is by decreasing buffer size. const int64 a_size = assignment->buffer_size_(*a); const int64 b_size = assignment->buffer_size_(*b); @@ -1441,9 +1441,9 @@ void BufferAssigner::BuildColocatedBufferSets( const HloInstruction* while_hlo = instruction; ShapeUtil::ForEachSubshape( while_hlo->shape(), - [this, while_hlo, &points_to_analysis, &buffer_liveness, - buffer_size, computation, colocated_buffer_sets]( - const Shape& /*subshape*/, const ShapeIndex& index) { + [this, while_hlo, &points_to_analysis, buffer_size, + colocated_buffer_sets](const Shape& /*subshape*/, + const ShapeIndex& index) { std::vector colocated_set; // Add while.init. AddBufferToColocatedSet(while_hlo->operand(0), index, diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 99abb9bae32b35652e84cddc7c38dbd97ecb5006..34f7fe12cac5a4dcd3822865bee903d6eabc25c0 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -48,11 +48,6 @@ namespace xla { // compuation. using ObjectFileData = std::vector; -// Contains the buffer sizes information needed to allocate buffers to execute -// an ahead-of-time computation. Entries which contain -1 designate a parameter -// which should be skipped over during allocation. -using BufferSizes = std::vector; - // Abstract superclass describing the result of an ahead-of-time compilation. class AotCompilationResult { public: diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 36fb9b43aa20bad788a0638b4fed6c88fc9023f0..3e39c1bab1e07d192a8c145be5103085fd3c189b 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -312,7 +312,7 @@ Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis, return Status::OK(); } -// We add copies for all the indices of the true and false computaiton roots, +// We add copies for all the indices of the true and false computation roots, // in order to resolve interference. We later rely on the CopyRemover to drop // the unnecessary ones. Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis, @@ -648,7 +648,12 @@ class CopyRemover { // We can only perform copy elision if the resulting merged values have // totally ordered live ranges; otherwise the merged buffer would have // live range interference. - if (IsHead(*dest)) { + if (src->next == dest) { + // In the process of eliding copies, its possible for a copy to have the + // same source and destination buffer. In this case, the copy can be + // safely removed. + VLOG(2) << copy->name() << " source and destination buffers are same."; + } else if (IsHead(*dest)) { // The copy copies an arbitrary value in the source buffer (call it s_x) // and defines d_0, the first value in the destination buffer. After // merging, the values in the combined buffer must be strictly ordered diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index cd735256b83f5f1d69a89e693de6064d460a36e5..892d0d7b547aaf1e7f1c55e4163d1e1fd9518def 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -2007,5 +2007,46 @@ ENTRY TestComputation { InsertCopies(module.get()); } +TEST_F(CopyInsertionTest, NestedWhiles) { + // Verify that only no unnecessary copies remain after copy insertion for + // trivial nested whiles (b/112472605). + const string& hlo_string = R"( +HloModule TestModule + +cond.inner { + ROOT param.cond.inner = pred[] parameter(0) +} + +body.inner { + param.body.inner = pred[] parameter(0) + ROOT neg = pred[] negate(param.body.inner) +} + +cond.outer { + ROOT param.cond.outer = pred[] parameter(0) +} + +body.outer { + param.cond.outer = pred[] parameter(0) + ROOT while = pred[] while(param.cond.outer), condition=cond.inner, body=body.inner +} + +ENTRY TestComputation { + entry_param = pred[] parameter(0) + ROOT while = pred[] while(entry_param), condition=cond.outer, body=body.outer +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())); + InsertCopies(module.get()); + + // There should only be a single copy inserted, and it's in the entry + // computation. + EXPECT_EQ(CountCopies(*module), 1); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::While(op::Copy(op::Parameter()))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 504b61d134a0099d055d0266408e1dfb94af5b2a..3efe3e2f93adc788258295e3142c1cc6c0a4bbef 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -54,12 +54,24 @@ cc_library( alwayslink = True, # Contains per-platform transfer manager registration ) +cc_library( + name = "buffer_info_util", + srcs = ["buffer_info_util.cc"], + hdrs = ["buffer_info_util.h"], + deps = [ + "//tensorflow/compiler/tf2xla:cpu_function_runtime", + "//tensorflow/compiler/xla/service:buffer_assignment", + "//tensorflow/core:lib", + ], +) + cc_library( name = "cpu_compiler", srcs = ["cpu_compiler.cc"], hdrs = ["cpu_compiler.h"], deps = [ ":compiler_functor", + ":buffer_info_util", ":conv_canonicalization", ":cpu_copy_insertion", ":cpu_executable", @@ -73,6 +85,7 @@ cc_library( ":ir_emitter", ":parallel_task_assignment", ":simple_orc_jit", + "//tensorflow/compiler/tf2xla:cpu_function_runtime", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc b/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..408fe0f5bf5d729165eadd532d4740211620645d --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/buffer_info_util.cc @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" + +namespace xla { +namespace cpu { + +using BufferInfo = ::tensorflow::cpu_function_runtime::BufferInfo; + +std::vector CreateBufferInfosFromBufferAssignment( + const BufferAssignment& buffer_assignment) { + std::vector buffer_infos; + for (const BufferAllocation& allocation : buffer_assignment.Allocations()) { + if (allocation.is_thread_local()) { + buffer_infos.push_back(BufferInfo::MakeOnStackBuffer(allocation.size())); + } else if (allocation.is_constant()) { + buffer_infos.push_back(BufferInfo::MakeConstant(allocation.size())); + } else if (allocation.is_entry_computation_parameter()) { + buffer_infos.push_back(BufferInfo::MakeEntryParameter( + /*size=*/allocation.size(), + /*param_number=*/allocation.parameter_number())); + } else { + buffer_infos.push_back(BufferInfo::MakeTempBuffer(allocation.size())); + } + } + return buffer_infos; +} + +std::vector CreateArgIndexTableFromBufferInfos( + tensorflow::gtl::ArraySlice buffer_infos) { + std::vector result; + for (int64 i = 0; i < buffer_infos.size(); i++) { + if (buffer_infos[i].is_entry_parameter()) { + if (buffer_infos[i].entry_parameter_number() >= result.size()) { + result.resize(buffer_infos[i].entry_parameter_number() + 1); + } + result[buffer_infos[i].entry_parameter_number()] = i; + } + } + return result; +} + +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/buffer_info_util.h b/tensorflow/compiler/xla/service/cpu/buffer_info_util.h new file mode 100644 index 0000000000000000000000000000000000000000..05de70c72686dcbdaf0b47c46cde23ed45abdb42 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/buffer_info_util.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_ + +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace xla { +namespace cpu { +// Creates and returns a list of BufferInfo instances containing relevant +// information from `buffer_assignment`. +std::vector<::tensorflow::cpu_function_runtime::BufferInfo> +CreateBufferInfosFromBufferAssignment( + const BufferAssignment& buffer_assignment); + +// Creates and returns a table containing the mapping from entry computation +// parameters to buffer allocation indices. +// +// If this function returns V then entry parameter i has buffer allocation index +// V[i]. +std::vector CreateArgIndexTableFromBufferInfos( + tensorflow::gtl::ArraySlice<::tensorflow::cpu_function_runtime::BufferInfo> + buffer_infos); +} // namespace cpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_BUFFER_INFO_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index 6a7eb85e3baec3517b8f3ddef6a8dcfae9c9e614..128eea4828b5e514b2ba6b398898e4a5d228e746 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -156,9 +156,26 @@ std::unique_ptr CompilerFunctor::operator()( target_machine_->addPassesToEmitMC(codegen_passes, mc_context, ostream); codegen_passes.run(module); - // Construct ObjectFile from machine code buffer. - return std::unique_ptr( + std::unique_ptr memory_buffer( new llvm::SmallVectorMemoryBuffer(std::move(stream_buffer))); + + if (VLOG_IS_ON(2)) { + llvm::Expected> obj_file = + llvm::object::ObjectFile::createObjectFile(*memory_buffer); + if (obj_file) { + StatusOr disasm_result = + disassembler_->DisassembleObjectFile(*obj_file.get()); + if (disasm_result.ok()) { + XLA_VLOG_LINES(2, disasm_result.ValueOrDie().text); + } else { + LOG(WARNING) << "Could not disassemble object file!"; + } + } else { + LOG(WARNING) << "Could convert memory buffer to object file!"; + } + } + + return memory_buffer; } static std::vector VectorFunctionsForTargetLibraryInfoImpl() { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index b49ea898962e437ec80dca0deec3aba70556b0dd..62272c29c0365a871975dd4a56e0a432cc62e98a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -50,6 +50,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_liveness.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" +#include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h" #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h" #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h" #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h" @@ -103,6 +104,7 @@ limitations under the License. namespace xla { namespace cpu { +using BufferInfo = ::tensorflow::cpu_function_runtime::BufferInfo; CpuAotCompilationOptions::CpuAotCompilationOptions( string triple, string cpu_name, string features, string entry_point_name, @@ -120,11 +122,11 @@ se::Platform::Id CpuAotCompilationOptions::PlatformId() const { } CpuAotCompilationResult::CpuAotCompilationResult( - ObjectFileData object_file_data, BufferSizes buffer_sizes, + ObjectFileData object_file_data, std::vector buffer_infos, int64 result_buffer_index, std::unique_ptr hlo_profile_printer_data) : object_file_data_(std::move(object_file_data)), - buffer_sizes_(std::move(buffer_sizes)), + buffer_infos_(std::move(buffer_infos)), result_buffer_index_(result_buffer_index), hlo_profile_printer_data_(std::move(hlo_profile_printer_data)) {} @@ -354,7 +356,7 @@ llvm::TargetOptions CompilerTargetOptions( llvm::TargetOptions target_options; llvm_ir::SetTargetOptions( /*fast_math_enabled=*/module_config.debug_options() - .xla_enable_fast_math(), + .xla_cpu_enable_fast_math(), &target_options); return target_options; } @@ -521,7 +523,7 @@ StatusOr> CpuCompiler::RunBackend( CompilerTargetOptions(module->config()), CodeGenOptLevel(module->config()), options::OptimizeForSizeRequested(module->config()), - module->config().debug_options().xla_enable_fast_math(), + module->config().debug_options().xla_cpu_enable_fast_math(), module->config().debug_options().xla_llvm_disable_expensive_passes(), pre_optimization_ir_hook, post_optimization_ir_hook); llvm_module->setDataLayout(jit->data_layout()); @@ -651,9 +653,9 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, // so we bail if the configs have conflicting flags. At the moment, the only // flag that needs to be consistent is fast-math. const bool fast_math_enabled = - modules[0]->config().debug_options().xla_enable_fast_math(); + modules[0]->config().debug_options().xla_cpu_enable_fast_math(); for (const auto& module : modules) { - if (module->config().debug_options().xla_enable_fast_math() != + if (module->config().debug_options().xla_cpu_enable_fast_math() != fast_math_enabled) { return InvalidArgument( "All HLO module configs must have the same value for " @@ -830,7 +832,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, CompilerFunctor compiler_functor( target_machine.get(), &disassembler, opt_level, options::OptimizeForSizeRequested(module->config()), - module->config().debug_options().xla_enable_fast_math(), + module->config().debug_options().xla_cpu_enable_fast_math(), module->config().debug_options().xla_llvm_disable_expensive_passes(), pre_optimization_ir_dump_hook, post_optimization_ir_dump_hook); std::unique_ptr object_file = @@ -838,28 +840,14 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, ObjectFileData object_file_data(object_file->getBufferStart(), object_file->getBufferEnd()); - BufferSizes buffer_sizes; - for (const BufferAllocation& allocation : assignment->Allocations()) { - // Callers don't need to allocate temporary buffers for parameters. - if (allocation.is_entry_computation_parameter() || - allocation.is_constant()) { - buffer_sizes.push_back(-1); - continue; - } - // Callers don't need to allocate anything for thread-local temporary - // buffers. They are lowered to allocas. - if (allocation.is_thread_local()) { - buffer_sizes.push_back(-1); - continue; - } - buffer_sizes.push_back(allocation.size()); - } + std::vector buffer_infos = + CreateBufferInfosFromBufferAssignment(*assignment); TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, assignment->GetUniqueTopLevelOutputSlice()); results.emplace_back(MakeUnique( - std::move(object_file_data), std::move(buffer_sizes), + std::move(object_file_data), std::move(buffer_infos), result_slice.index(), std::move(hlo_profile_printer_data))); } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index e56f9f01134f84b4698c078b750b0c1fdca7748e..04e1c48872ed55ca7f2aa3bec08c44a1666b90f1 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "llvm/Target/TargetMachine.h" +#include "tensorflow/compiler/tf2xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/llvm_compiler.h" @@ -78,7 +79,8 @@ class CpuAotCompilationOptions : public AotCompilationOptions { class CpuAotCompilationResult : public AotCompilationResult { public: CpuAotCompilationResult( - ObjectFileData object_file_data, BufferSizes buffer_sizes, + ObjectFileData object_file_data, + std::vector<::tensorflow::cpu_function_runtime::BufferInfo> buffer_infos, int64 result_buffer_index, std::unique_ptr hlo_profile_printer_data); ~CpuAotCompilationResult(); @@ -88,17 +90,20 @@ class CpuAotCompilationResult : public AotCompilationResult { } const ObjectFileData& object_file_data() const { return object_file_data_; } - const BufferSizes& buffer_sizes() const { return buffer_sizes_; } + const std::vector<::tensorflow::cpu_function_runtime::BufferInfo>& + buffer_infos() const { + return buffer_infos_; + } int64 result_buffer_index() const { return result_buffer_index_; } private: // Contains the compiled computation: an object file. const ObjectFileData object_file_data_; - // The list of buffer sizes which should be allocated in order to execute the - // compiled computation. These buffers are used for temporary buffers used - // ephemerally during computation as well as the output result. - const BufferSizes buffer_sizes_; + // A list of BufferInfo objects describing the buffers used by the XLA + // computation. + const std::vector<::tensorflow::cpu_function_runtime::BufferInfo> + buffer_infos_; // Contains which buffer index into |buffer_sizes| was designated to the // result of the computation. This buffer should be passed into the output diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 81e17a5cd4de7151217ba0f2710c49546bce1f10..c376864c3e1f882e11bc05f8cf93f2fb1c88e4ec 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -69,12 +69,19 @@ CpuExecutable::CpuExecutable( // guarded by the mutex. compute_function_ = reinterpret_cast(cantFail(sym.getAddress())); + VLOG(1) << "compute_function_ at address " + << reinterpret_cast(compute_function_); } -Status CpuExecutable::AllocateBuffers( +StatusOr, + std::vector>> +CpuExecutable::CreateTempArray( DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { - CHECK_EQ(buffers->size(), assignment_->Allocations().size()); + tensorflow::gtl::ArraySlice arguments) { + std::vector unowning_buffers( + assignment_->Allocations().size()); + std::vector owning_buffers( + assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); @@ -84,6 +91,8 @@ Status CpuExecutable::AllocateBuffers( VLOG(3) << allocation.ToString(); if (allocation.is_entry_computation_parameter()) { + unowning_buffers[i] = arguments[allocation.parameter_number()]->buffer( + allocation.param_shape_index()); VLOG(3) << "allocation #" << i << " is a parameter"; continue; } @@ -99,34 +108,34 @@ Status CpuExecutable::AllocateBuffers( } int64 buffer_size = allocation.size(); - if (!(*buffers)[i].is_null()) { + if (!owning_buffers[i].is_null()) { VLOG(3) << "buffer #" << i << " is in the preallocated result ShapedBuffer"; } else { - TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate( - device_ordinal, buffer_size)); + TF_ASSIGN_OR_RETURN(owning_buffers[i], memory_allocator->Allocate( + device_ordinal, buffer_size)); + unowning_buffers[i] = owning_buffers[i].AsDeviceMemoryBase(); VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes [" - << (*buffers)[i].opaque() << "]"; + << owning_buffers[i].opaque() << "]"; } // Since the output buffer and all the temporary buffers were written into // by the JITed code, msan has no way of knowing their memory was // initialized. Mark them initialized so that msan doesn't flag loads from // these buffers. - TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size); + TF_ANNOTATE_MEMORY_IS_INITIALIZED(owning_buffers[i].opaque(), buffer_size); } TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, assignment_->GetUniqueTopLevelOutputSlice()); VLOG(3) << "result index: " << result_slice.index(); - return Status::OK(); + return {{std::move(unowning_buffers), std::move(owning_buffers)}}; } Status CpuExecutable::ExecuteComputeFunction( const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile) { // The calling convention for JITed functions is: @@ -136,17 +145,11 @@ Status CpuExecutable::ExecuteComputeFunction( // // result: Points at the result. // run_options: the ExecutableRunOptions object. - // args_array: An array of pointers, each of which points to a parameter. - // The size of this array is determined by the function's arity - // (ProgramShape). - // temps_array: An array of pointers, each of which points to a temporary - // buffer the computation needs. The size of this array is - // determined by buffer analysis. + // args_array: null + // temps_array: An array of pointers, containing pointers to temporary buffers + // required by the executable adn pointers to entry computation + // parameters. // - std::vector args_array; - for (const ShapedBuffer* argument : arguments) { - args_array.push_back(argument->root_buffer().opaque()); - } uint64 start_micros = tensorflow::Env::Default()->NowMicros(); @@ -169,16 +172,14 @@ Status CpuExecutable::ExecuteComputeFunction( if (VLOG_IS_ON(3)) { VLOG(3) << "Executing compute function:"; VLOG(3) << tensorflow::strings::Printf( - " func(void* result, void* params[%zu], void* temps[%zu], " + " func(void* result, void* params[null], void* temps[%zu], " "uint64 profile_counters[%zu])", - args_array.size(), buffer_pointers.size(), profile_counters_size); + buffer_pointers.size(), profile_counters_size); VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer); auto ptr_printer = [](string* out, const void* p) { tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p)); }; - VLOG(3) << tensorflow::strings::Printf( - " params = [%s]", - tensorflow::str_util::Join(args_array, ", ", ptr_printer).c_str()); + VLOG(3) << " params = nullptr"; VLOG(3) << tensorflow::strings::Printf( " temps = [%s]", tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str()); @@ -186,8 +187,8 @@ Status CpuExecutable::ExecuteComputeFunction( profile_counters); } - compute_function_(result_buffer, run_options, args_array.data(), - buffer_pointers.data(), profile_counters); + compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(), + profile_counters); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -248,27 +249,11 @@ StatusOr CpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { - if (GetRootPointsToSet().IsAmbiguous()) { - return Unimplemented("Points-to set of root instruction is ambiguous"); - } - - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - std::vector unowning_buffers; - unowning_buffers.reserve(buffers.size()); - for (auto& buffer : buffers) { - unowning_buffers.push_back(buffer.AsDeviceMemoryBase()); - } - TF_RETURN_IF_ERROR(ExecuteComputeFunction(&run_options->run_options(), - arguments, unowning_buffers, - hlo_execution_profile)); - - return CreateResultShapedBuffer(run_options, &buffers); + TF_ASSIGN_OR_RETURN( + auto result, + ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile)); + TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone()); + return std::move(result); } StatusOr CpuExecutable::ExecuteAsyncOnStream( @@ -279,22 +264,30 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( "Asynchronous execution on stream with hlo profiling is not yet " "supported on CPU."); } + return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr); +} + +StatusOr CpuExecutable::ExecuteAsyncOnStreamImpl( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile) { + if (GetRootPointsToSet().IsAmbiguous()) { + return Unimplemented("Points-to set of root instruction is ambiguous"); + } auto* host_stream = dynamic_cast( run_options->stream()->implementation()); se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - + std::vector owning_buffers; std::vector unowning_buffers; - unowning_buffers.reserve(buffers.size()); - for (auto& buffer : buffers) { - unowning_buffers.push_back(buffer.AsDeviceMemoryBase()); - } + TF_ASSIGN_OR_RETURN( + std::tie(unowning_buffers, owning_buffers), + CreateTempArray(memory_allocator, stream->parent()->device_ordinal(), + arguments)); + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, - CreateResultShapedBuffer(run_options, &buffers)); + CreateResultShapedBuffer(run_options, &owning_buffers)); // At this point, `unowning_buffers` contains unowning pointers to all of our // buffers, and `buffers` contains owning pointers to the non-live-out @@ -312,23 +305,22 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( struct AsyncRunTask { CpuExecutable* executable; ServiceExecutableRunOptions run_options; - std::vector arguments; std::vector unowning_buffers; std::shared_ptr> buffers; + HloExecutionProfile* hlo_execution_profile; void operator()() { // Failing a CHECK here is not great, but I don't see an obvious way to // return a failed Status asynchronously. TF_CHECK_OK(executable->ExecuteComputeFunction( - &run_options.run_options(), arguments, unowning_buffers, - /*hlo_execution_profile=*/nullptr)); + &run_options.run_options(), unowning_buffers, hlo_execution_profile)); } }; - host_stream->EnqueueTask(AsyncRunTask{ - this, *run_options, - std::vector(arguments.begin(), arguments.end()), - unowning_buffers, - std::make_shared>(std::move(buffers))}); + host_stream->EnqueueTask( + AsyncRunTask{this, *run_options, std::move(unowning_buffers), + std::make_shared>( + std::move(owning_buffers)), + hlo_execution_profile}); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 8dd47bfb865e8a0552542f510d3365cff0d111e0..96e53de57eee013fe6f847c10e23a38f5beb9adc 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -85,20 +85,39 @@ class CpuExecutable : public Executable { const BufferAssignment& buffer_assignment() const { return *assignment_; } private: - // Allocate buffers required for execution and assign them to the elements of - // "buffers". "buffers" should be sized to the number of buffers in buffer - // assignment. Each vector element corresponds to a particular Index. If - // a vector element already contains a non-null DeviceMemoryBase, then no - // buffer is assigned for this element. - Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, - int device_ordinal, - std::vector* buffers); + // This is for sharing the code between ExecuteOnStream and + // ExecuteAsyncOnStream. + // + // Notice that it's tricky to use correctly, as the profile object (when it + // exists) must out-live the task. + StatusOr ExecuteAsyncOnStreamImpl( + const ServiceExecutableRunOptions* run_options, + tensorflow::gtl::ArraySlice arguments, + HloExecutionProfile* hlo_execution_profile); + + // Creates an array suitable for passing as the "temps" argument to the JIT + // compiled function pointer. + // + // Returns (unowning_buffers, owning_buffers) where: + // + // - unowning_buffers.data() can be passed as the temps argument as-is and + // includes pointers to the scratch storage required by the computation, + // the live-out buffer into which the result will be written and entry + // computation parameters. + // + // - owning_buffers contains owning pointers to the buffers that were + // allocated by this routine. This routine allocates buffers for temporary + // storage and the live-out buffer into which the computation writes it + // result. + StatusOr, + std::vector>> + CreateTempArray(DeviceMemoryAllocator* memory_allocator, int device_ordinal, + tensorflow::gtl::ArraySlice arguments); // Calls the generated function performing the computation with the given // arguments using the supplied buffers. Status ExecuteComputeFunction( const ExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 54c52bc08f9c53b8c6898689b18c4cb7f4bdcfd0..639064040f521a9e84bd87c5d05f674204e4d6e2 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -92,9 +92,10 @@ tensorflow::string ShapeString(const void* shape_ptr, xla::int32 shape_length) { } // namespace -void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length, - const void* shape, - xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void* +__xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length, + const void* shape, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "AcquireInfeedBufferForDequeue: " << ShapeString(shape, shape_length); @@ -111,9 +112,11 @@ void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(xla::int32 buffer_length, return buffer->data(); } -void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue( - xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr, - xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(xla::int32 buffer_length, + void* buffer_ptr, + const void* shape_ptr, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "ReleaseInfeedBufferAfterDeque: " << ShapeString(shape_ptr, shape_length); @@ -125,8 +128,10 @@ void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue( std::move(shape)); } -void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation( - xla::int32 buffer_length, const void* shape_ptr, xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void* +__xla_cpu_runtime_AcquireOutfeedBufferForPopulation(xla::int32 buffer_length, + const void* shape_ptr, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "AcquireOutfeedBufferForPopulation: " << ShapeString(shape_ptr, shape_length); @@ -143,9 +148,11 @@ void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation( return buffer->data(); } -void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation( - xla::int32 buffer_length, void* buffer_ptr, const void* shape_ptr, - xla::int32 shape_length) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(xla::int32 buffer_length, + void* buffer_ptr, + const void* shape_ptr, + xla::int32 shape_length) { if (VLOG_IS_ON(2)) { LOG(INFO) << "ReleaseOutfeedBufferAfterPopulation: " << ShapeString(shape_ptr, shape_length); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index 156166bf2b1ea6d3821da8f67ea2b2eca6825ca6..59bc7e0e16fcc66a010408259a1ccfb2b6bb35fd 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -173,7 +173,7 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor, Status CpuTransferManager::TransferLiteralFromOutfeed( se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) { + MutableBorrowingLiteral literal) { if (!ShapeUtil::IsTuple(literal_shape)) { int64 size = GetByteSizeRequirement(literal_shape); // Note: OSS build didn't like implicit conversion from @@ -181,18 +181,16 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( tensorflow::gtl::ArraySlice dimensions( tensorflow::bit_cast(literal_shape.dimensions().data()), literal_shape.dimensions().size()); - *literal = std::move(*LiteralUtil::CreateFromDimensions( - literal_shape.element_type(), dimensions)); - TF_ASSIGN_OR_RETURN(Shape received_shape, - TransferArrayBufferFromOutfeed( - executor, literal->untyped_data(), size)); - TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal->shape())) + TF_ASSIGN_OR_RETURN( + Shape received_shape, + TransferArrayBufferFromOutfeed(executor, literal.untyped_data(), size)); + TF_RET_CHECK(ShapeUtil::Compatible(received_shape, literal.shape())) << "Shape received from outfeed " << ShapeUtil::HumanString(received_shape) << " did not match the shape that was requested for outfeed: " << ShapeUtil::HumanString(literal_shape); TF_RET_CHECK(size == GetByteSizeRequirement(received_shape)); - *literal->mutable_shape_do_not_use() = received_shape; + *literal.mutable_shape_do_not_use() = received_shape; return Status::OK(); } @@ -201,22 +199,12 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( "Nested tuple outfeeds are not yet implemented on CPU."); } - std::vector> elements; std::vector> buffer_data; for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) { const Shape& tuple_element_shape = ShapeUtil::GetTupleElementShape(literal_shape, i); - // Note: OSS build didn't like implicit conversion from - // literal_shape.dimensions() to the array slice on 2017-07-10. - tensorflow::gtl::ArraySlice dimensions( - tensorflow::bit_cast( - tuple_element_shape.dimensions().data()), - tuple_element_shape.dimensions().size()); - auto empty = LiteralUtil::CreateFromDimensions( - tuple_element_shape.element_type(), dimensions); int64 size = GetByteSizeRequirement(tuple_element_shape); - buffer_data.push_back({empty->untyped_data(), size}); - elements.push_back(std::move(empty)); + buffer_data.push_back({literal.untyped_data({i}), size}); } TF_ASSIGN_OR_RETURN(Shape received_shape, @@ -230,11 +218,7 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( TF_RET_CHECK(GetByteSizeRequirement(literal_shape) == GetByteSizeRequirement(received_shape)); - for (int64 i = 0; i < literal_shape.tuple_shapes_size(); ++i) { - *elements[i]->mutable_shape_do_not_use() = received_shape.tuple_shapes(i); - } - *literal = std::move(*LiteralUtil::MakeTupleOwned(std::move(elements))); - TF_RET_CHECK(ShapeUtil::Equal(literal->shape(), literal_shape)); + TF_RET_CHECK(ShapeUtil::Equal(literal.shape(), literal_shape)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h index 593575c0fdaddc71cd6bd844fd179096a9fb0fdc..80ef953d532798281c10b7a212b9c4d84a790c27 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h" #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" @@ -41,7 +42,7 @@ class CpuTransferManager : public GenericTransferManager { const LiteralSlice& literal) override; Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + MutableBorrowingLiteral literal) override; private: Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 645888de783e4025cffd6fa4835e60b84bbd7d99..f2ac742b6e6fc12076e7a2a242155c005f4b05b8 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -1066,7 +1066,7 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled( << config.GetCacheKey(); const bool enable_fast_math = - hlo_module_config_.debug_options().xla_enable_fast_math(); + hlo_module_config_.debug_options().xla_cpu_enable_fast_math(); const bool optimize_for_size = options::OptimizeForSizeRequested(hlo_module_config_); @@ -1149,7 +1149,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { swap_operands ? lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer(); const bool enable_fast_math = - hlo_module_config_.debug_options().xla_enable_fast_math(); + hlo_module_config_.debug_options().xla_cpu_enable_fast_math(); const bool optimize_for_size = options::OptimizeForSizeRequested(hlo_module_config_); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index cf955a8add394c204673be0746a451d4edcadc96..db54454707983ade31594119b2e868fa168d4cc2 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -19,6 +19,8 @@ limitations under the License. #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/types.h" @@ -28,47 +30,6 @@ limitations under the License. namespace xla { namespace cpu { -StatusOr CpuElementalIrEmitter::EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { - switch (op->opcode()) { - case HloOpcode::kTanh: { - PrimitiveType element_type = op->shape().element_type(); - bool cast_result_to_fp16 = false; - string function_name; - switch (element_type) { - case F16: - cast_result_to_fp16 = true; - operand_value = b_->CreateFPCast(operand_value, b_->getFloatTy()); - TF_FALLTHROUGH_INTENDED; - case F32: - function_name = "tanhf"; - break; - case F64: - function_name = "tanh"; - break; - default: - return Unimplemented("tanh"); - } - // Create a function declaration. - llvm::Function* function = - llvm::cast(module_->getOrInsertFunction( - llvm_ir::AsStringRef(function_name), operand_value->getType(), - operand_value->getType())); - function->setCallingConv(llvm::CallingConv::C); - function->setDoesNotThrow(); - function->setDoesNotAccessMemory(); - // Create an instruction to call the function. - llvm::Value* result = b_->CreateCall(function, operand_value); - if (cast_result_to_fp16) { - result = b_->CreateFPCast(result, b_->getHalfTy()); - } - return result; - } - default: - return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value); - } -} - StatusOr CpuElementalIrEmitter::EmitAtan2( PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { string function_name; @@ -104,6 +65,39 @@ StatusOr CpuElementalIrEmitter::EmitAtan2( return result; } +StatusOr CpuElementalIrEmitter::EmitTanh( + PrimitiveType prim_type, llvm::Value* value) const { + bool cast_result_to_fp16 = false; + string function_name; + switch (prim_type) { + case F16: + cast_result_to_fp16 = true; + value = b_->CreateFPCast(value, b_->getFloatTy()); + TF_FALLTHROUGH_INTENDED; + case F32: + function_name = "tanhf"; + break; + case F64: + function_name = "tanh"; + break; + default: + return Unimplemented("tanh"); + } + // Create a function declaration. + llvm::Function* function = llvm::cast( + module_->getOrInsertFunction(llvm_ir::AsStringRef(function_name), + value->getType(), value->getType())); + function->setCallingConv(llvm::CallingConv::C); + function->setDoesNotThrow(); + function->setDoesNotAccessMemory(); + // Create an instruction to call the function. + llvm::Value* result = b_->CreateCall(function, value); + if (cast_result_to_fp16) { + result = b_->CreateFPCast(result, b_->getHalfTy()); + } + return result; +} + llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator) const { @@ -117,9 +111,8 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( ElementwiseSourceIndex(index, *hlo, i))); operands.push_back(operand_value); } - return ir_emitter_->EmitScalarCall(hlo->shape().element_type(), - hlo->to_apply(), operands, - llvm_ir::IrName(hlo)); + return ir_emitter_->EmitElementalMap(*Cast(hlo), + operands, llvm_ir::IrName(hlo)); }; } return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index 9598a886ab49fcecf5df7bd65f425fe485de3574..76833e765d05f2477961cd06cead66797c5be623 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -39,10 +39,10 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { const HloToElementGeneratorMap& operand_to_generator) const override; protected: - StatusOr EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const override; StatusOr EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const override; + StatusOr EmitTanh(PrimitiveType prim_type, + llvm::Value* value) const override; IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index a6d8551841dcba8b81e257f3deb2aacf9b8aff4a..6f433b4f30372da9cf4503396dbb60172cfc0cb0 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -99,7 +99,7 @@ IrEmitter::IrEmitter( target_machine_features_(*target_machine_features) { b_.setFastMathFlags(llvm_ir::GetFastMathFlags( /*fast_math_enabled=*/hlo_module_config_.debug_options() - .xla_enable_fast_math())); + .xla_cpu_enable_fast_math())); } StatusOr IrEmitter::EmitComputation( @@ -116,6 +116,19 @@ StatusOr IrEmitter::EmitComputation( computation->root_instruction()->outer_dimension_partitions().size(); } + if (computation->root_instruction()->opcode() != HloOpcode::kOutfeed) { + TF_ASSIGN_OR_RETURN( + computation_root_allocation_, + assignment_.GetUniqueTopLevelSlice(computation->root_instruction())); + } + + for (const HloInstruction* param : computation->parameter_instructions()) { + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice param_slice, + assignment_.GetUniqueTopLevelSlice(param)); + computation_parameter_allocations_[param_slice.allocation()->index()] = + param->parameter_number(); + } + InitializeIrFunction(function_name); // The rdtscp instruction is x86 specific. We will fallback to LLVM's generic // readcyclecounter if it is unavailable. @@ -132,6 +145,8 @@ StatusOr IrEmitter::EmitComputation( // Delete 'compute_function', finalizing 'ir_function' and restoring caller // IR insert point. compute_function_.reset(); + computation_root_allocation_ = BufferAllocation::Slice(); + computation_parameter_allocations_.clear(); return ir_function; } @@ -143,11 +158,11 @@ void IrEmitter::InitializeIrFunction(const string& function_name) { is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage : llvm::GlobalValue::InternalLinkage; // Create and initialize new IrFunction. - compute_function_.reset( - new IrFunction(function_name, linkage, - options::OptimizeForSizeRequested(hlo_module_config_), - hlo_module_config_.debug_options().xla_enable_fast_math(), - module_, &b_, num_dynamic_loop_bounds_)); + compute_function_.reset(new IrFunction( + function_name, linkage, + options::OptimizeForSizeRequested(hlo_module_config_), + hlo_module_config_.debug_options().xla_cpu_enable_fast_math(), module_, + &b_, num_dynamic_loop_bounds_)); } IrEmitter::~IrEmitter() {} @@ -484,23 +499,11 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) { return Status::OK(); } -StatusOr IrEmitter::EmitTargetElementLoopBodyForMap( - HloMapInstruction* map, const llvm_ir::IrArray::Index& index) { - llvm::Function* mapped_ir_function = - FindOrDie(emitted_functions_, map->to_apply()); - std::vector parameter_addresses; - for (const HloInstruction* operand : map->operands()) { - const llvm_ir::IrArray& array = GetIrArrayFor(operand); - parameter_addresses.push_back(array.EmitArrayElementAddress(index, &b_)); - } - return EmitElementFunctionCall(mapped_ir_function, map->shape(), - parameter_addresses, "map_function"); -} - -Status IrEmitter::HandleMap(HloInstruction* map) { - return EmitTargetElementLoop(map, [&](const llvm_ir::IrArray::Index& index) { - return EmitTargetElementLoopBodyForMap(Cast(map), index); - }); +llvm::Value* IrEmitter::EmitElementalMap( + const HloMapInstruction& map_instr, + tensorflow::gtl::ArraySlice elemental_operands, + tensorflow::StringPiece name) { + return EmitThreadLocalCall(*map_instr.to_apply(), elemental_operands, name); } StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( @@ -508,9 +511,6 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( const llvm_ir::IrArray::Index& index) { const HloInstruction* operand = reduce_window->operand(0); const Window& window = reduce_window->window(); - HloComputation* function = reduce_window->to_apply(); - // The called computation should have been emitted previously. - llvm::Function* reducer_function = FindOrDie(emitted_functions_, function); // We fold inputs into the accumulator and initialize it to // the initial value on the reduce_window. @@ -563,11 +563,10 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduceWindow( // We are not in the padding, so carry out the computation. llvm_ir::IrArray input_array(GetIrArrayFor(operand)); - llvm::Value* input_value_address = - input_array.EmitArrayElementAddress(input_index, &b_); - llvm::Value* result = EmitElementFunctionCall( - reducer_function, reduce_window->shape(), - {accumulator_address, input_value_address}, "reducer_function"); + llvm::Value* input_value = input_array.EmitReadArrayElement(input_index, &b_); + llvm::Value* result = EmitThreadLocalCall( + *reduce_window->to_apply(), + {b_.CreateLoad(accumulator_address), input_value}, "reducer_function"); b_.CreateStore(result, accumulator_address); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); @@ -578,7 +577,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*reduce_window, /*operands=*/{reduce_window->operand(0)}, - /*supported_types=*/{F32, BF16, S32})); + /*supported_types=*/{F32, BF16, S32, F16})); // TODO(b/31410564): Implement dilation for reduce-window. if (window_util::HasDilation(reduce_window->window())) { @@ -623,12 +622,6 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { "Dilation for SelectAndScatter is not implemented on CPU. "); } - // The select and scatter computations should have been emitted previously. - llvm::Function* select_function = - FindOrDie(emitted_functions_, select_and_scatter->select()); - llvm::Function* scatter_function = - FindOrDie(emitted_functions_, select_and_scatter->scatter()); - // Pseudo code for select-and-scatter: // // initialized_flag is initially off for every window, and is turned on after @@ -733,11 +726,12 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { // If the initialized_flag is true, call the `select` function to potentially // update the selected value and index with the currently visiting operand. SetToFirstInsertPoint(if_initialized.true_block, &b_); - const Shape output_shape = ShapeUtil::MakeShape(PRED, {}); llvm::Value* operand_address = operand_array.EmitArrayElementAddress(operand_index, &b_); - llvm::Value* result = EmitElementFunctionCall( - select_function, output_shape, {selected_value_address, operand_address}, + llvm::Value* operand_element = b_.CreateLoad(operand_address); + llvm::Value* result = EmitThreadLocalCall( + *select_and_scatter->select(), + {b_.CreateLoad(selected_value_address), operand_element}, "select_function"); // If the 'select' function returns false, update the selected value and the @@ -764,14 +758,14 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) { selected_index.push_back(b_.CreateLoad(selected_index_address_slot)); } llvm_ir::IrArray source_array(GetIrArrayFor(source)); - llvm::Value* source_value_address = - source_array.EmitArrayElementAddress(source_index, &b_); + llvm::Value* source_value = + source_array.EmitReadArrayElement(source_index, &b_); llvm_ir::IrArray output_array(GetIrArrayFor(select_and_scatter)); - llvm::Value* output_value_address = - output_array.EmitArrayElementAddress(selected_index, &b_); - llvm::Value* scatter_value = EmitElementFunctionCall( - scatter_function, source->shape(), - {output_value_address, source_value_address}, "scatter_function"); + llvm::Value* output_value = + output_array.EmitReadArrayElement(selected_index, &b_); + llvm::Value* scatter_value = + EmitThreadLocalCall(*select_and_scatter->scatter(), + {output_value, source_value}, "scatter_function"); output_array.EmitWriteArrayElement(selected_index, scatter_value, &b_); SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(), &b_); @@ -1248,46 +1242,7 @@ static llvm_ir::IrArray::Index FillReducedDimensionIndex( Status IrEmitter::HandleParameter(HloInstruction* parameter) { VLOG(2) << "HandleParameter: " << parameter->ToString(); - auto param_number = parameter->parameter_number(); - auto param_shape = parameter->shape(); - - // We have to access the parameter at offset param_number in the params - // array. The code generated here is equivalent to this C code: - // - // i8* param_address_untyped = params[param_number]; - // Param* param_address_typed = (Param*)param_address_untyped; - // - // Where Param is the actual element type of the underlying buffer (for - // example, float for an XLA F32 element type). - llvm::Value* params = compute_function_->parameters_arg(); - llvm::Value* param_address_offset = - llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_); - llvm::LoadInst* param_address_untyped = b_.CreateLoad(param_address_offset); - param_address_untyped->setName(AsStringRef(IrName(parameter, "untyped"))); - if (is_top_level_computation_ && - hlo_module_config_.debug_options() - .xla_llvm_enable_invariant_load_metadata()) { - // In the entry computation the parameter slots in the %params argument are - // invariant through program execution. In computations that are called - // from the entry computation (via kWhile, kCall and kConditional) the - // parameter slots are *not* invariant since they're written to by their - // callers. - param_address_untyped->setMetadata( - llvm::LLVMContext::MD_invariant_load, - llvm::MDNode::get(param_address_untyped->getContext(), /*MDs=*/{})); - } - - llvm::Value* param_address_typed = b_.CreateBitCast( - param_address_untyped, IrShapeType(param_shape)->getPointerTo()); - emitted_value_[parameter] = param_address_typed; - - if (!ShapeUtil::IsOpaque(param_shape)) { - AttachAlignmentMetadataForLoad(param_address_untyped, param_shape); - AttachDereferenceableMetadataForLoad(param_address_untyped, param_shape); - } - - VLOG(2) << " emitted value: " << llvm_ir::DumpToString(*param_address_typed); - return Status::OK(); + return EmitTargetAddressForOp(parameter); } // Returns true if the relative order of the unreduced dimensions stays the same @@ -1751,9 +1706,6 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduce( const HloInstruction* arg = reduce->mutable_operand(0); const HloInstruction* init_value = reduce->mutable_operand(1); gtl::ArraySlice dimensions(reduce->dimensions()); - HloComputation* function = reduce->to_apply(); - // The called computation should have been emitted previously. - llvm::Function* reducer_function = FindOrDie(emitted_functions_, function); // Initialize an accumulator with init_value. PrimitiveType accumulator_type = reduce->shape().element_type(); @@ -1793,10 +1745,9 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduce( CHECK(index.end() == it); // Apply the reduction function to the loaded value. - llvm::Value* input_address = - arg_array.EmitArrayElementAddress(input_index, &b_); - llvm::Value* result = EmitElementFunctionCall( - reducer_function, reduce->shape(), {accumulator_addr, input_address}, + llvm::Value* input_element = arg_array.EmitReadArrayElement(input_index, &b_); + llvm::Value* result = EmitThreadLocalCall( + *reduce->to_apply(), {b_.CreateLoad(accumulator_addr), input_element}, "reduce_function"); b_.CreateStore(result, accumulator_addr); @@ -1805,6 +1756,10 @@ StatusOr IrEmitter::EmitTargetElementLoopBodyForReduce( } Status IrEmitter::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Support variadic reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Variadic reduce is not supported on CPU"); + } auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); gtl::ArraySlice dimensions(reduce->dimensions()); @@ -1842,6 +1797,10 @@ Status IrEmitter::HandleSendDone(HloInstruction* send_done) { return Unimplemented("Send-done is not implemented on CPU."); } +Status IrEmitter::HandleScatter(HloInstruction*) { + return Unimplemented("Scatter is not implemented on CPUs."); +} + Status IrEmitter::HandleSlice(HloInstruction* slice) { VLOG(2) << "HandleSlice: " << slice->ToString(); auto operand = slice->operand(0); @@ -2134,18 +2093,13 @@ Status IrEmitter::HandleCall(HloInstruction* call) { HloComputation* computation = call->to_apply(); llvm::Function* call_ir_function = FindOrDie(emitted_functions_, computation); - std::vector parameter_addresses; - for (const HloInstruction* operand : call->operands()) { - parameter_addresses.push_back(GetEmittedValueFor(operand)); - } - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(call)); if (!computation->root_instruction()->outer_dimension_partitions().empty()) { // ParallelTaskAssignment assigned partitions, emit call to // ParallelForkJoin. std::vector call_args = GetArrayFunctionCallArguments( - parameter_addresses, &b_, computation->name(), + {}, &b_, computation->name(), /*return_value_buffer=*/emitted_value_[call], /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), /*temp_buffers_arg=*/GetTempBuffersArgument(), @@ -2156,8 +2110,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) { call_args, root->shape(), root->outer_dimension_partitions(), &b_, call_ir_function, computation->name())); } else { - EmitArrayFunctionCallInto(call_ir_function, parameter_addresses, - emitted_value_[call], computation->name()); + EmitGlobalCall(*computation, computation->name()); } return Status::OK(); @@ -2238,12 +2191,6 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { const HloInstruction* init = xla_while->operand(0); emitted_value_[xla_while] = GetEmittedValueFor(init); - // The called computation should have been emitted previously. - llvm::Function* condition_ir_function = - FindOrDie(emitted_functions_, condition); - llvm::Function* body_ir_function = - FindOrDie(emitted_functions_, xla_while->while_body()); - // Generating: // while (Condition(while_result)) { // // CopyInsertion pass inserts copies which enable 'while_result' to @@ -2260,12 +2207,10 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { // Calls the condition function to determine whether to proceed with the // body. It must return a bool, so use the scalar call form. - llvm::Value* while_result = GetEmittedValueFor(xla_while); - llvm::Value* while_condition = EmitElementFunctionCall( - condition_ir_function, condition->root_instruction()->shape(), - {while_result}, IrName(xla_while, "cond")); + EmitGlobalCall(*xla_while->while_condition(), IrName(xla_while, "cond")); llvm::Value* while_predicate = b_.CreateICmpNE( - while_condition, + b_.CreateLoad( + GetBufferForGlobalCallReturnValue(*xla_while->while_condition())), llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0)); // Branches to the body or to the while exit depending on the condition. @@ -2280,8 +2225,8 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) { b_.SetInsertPoint(body_bb); // Calls the body function. - EmitArrayFunctionCallInto(body_ir_function, {while_result}, while_result, - IrName(xla_while, "body")); + EmitGlobalCall(*xla_while->while_body(), IrName(xla_while, "body")); + // Finishes with a branch back to the header. b_.CreateBr(header_bb); @@ -2449,8 +2394,6 @@ Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) { Status IrEmitter::HandleConditional(HloInstruction* conditional) { auto pred = conditional->operand(0); - auto true_arg = conditional->operand(1); - auto false_arg = conditional->operand(2); TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()) && pred->shape().element_type() == PRED) << "Predicate on a Conditional must be bool; got: " @@ -2472,13 +2415,7 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) { << " and " << ShapeUtil::HumanString(false_computation->root_instruction()->shape()); - llvm::Function* true_function = - FindOrDie(emitted_functions_, true_computation); - llvm::Function* false_function = - FindOrDie(emitted_functions_, false_computation); - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(conditional)); - llvm::Value* conditional_result = GetEmittedValueFor(conditional); // Generating: // if (pred) @@ -2495,12 +2432,12 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) { llvm_ir::EmitIfThenElse(pred_cond, "conditional", &b_); SetToFirstInsertPoint(if_data.true_block, &b_); - EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)}, - conditional_result, IrName(conditional, "_true")); + EmitGlobalCall(*conditional->true_computation(), + IrName(conditional, "_true")); SetToFirstInsertPoint(if_data.false_block, &b_); - EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)}, - conditional_result, IrName(conditional, "_false")); + EmitGlobalCall(*conditional->false_computation(), + IrName(conditional, "_false")); SetToFirstInsertPoint(if_data.after_block, &b_); return Status::OK(); @@ -2701,44 +2638,76 @@ llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() { return compute_function_->exec_run_options_arg(); } -llvm::Value* IrEmitter::EmitTempBufferPointer( +llvm::Value* IrEmitter::EmitThreadLocalTempBufferPointer( const BufferAllocation::Slice& slice, const Shape& target_shape) { - llvm::Type* element_type = IrShapeType(target_shape); - // The alignment and number of bytes within the temporary buffer is determined - // by the maximal shape as determined by buffer assignment. - const BufferAllocation& allocation = assignment_.GetAllocation(slice.index()); - if (allocation.is_thread_local()) { + const BufferAllocation& allocation = *slice.allocation(); + llvm::Value* tempbuf_address = [&]() -> llvm::Value* { + if (slice == computation_root_allocation_) { + llvm::Argument* retval = compute_function_->result_arg(); + llvm::AttrBuilder attr_builder; + attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape)); + attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape)); + retval->addAttrs(attr_builder); + return retval; + } + + auto param_it = + computation_parameter_allocations_.find(slice.allocation()->index()); + if (param_it != computation_parameter_allocations_.end()) { + int64 param_number = param_it->second; + // We have to access the parameter at offset param_number in the params + // array. The code generated here is equivalent to this C code: + // + // i8* param_address_untyped = params[param_number]; + // Param* param_address_typed = (Param*)param_address_untyped; + // + // Where Param is the actual element type of the underlying buffer (for + // example, float for an XLA F32 element type). + llvm::Value* params = compute_function_->parameters_arg(); + llvm::Value* param_address_offset = + llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_); + llvm::LoadInst* param_address_untyped = + b_.CreateLoad(param_address_offset); + + if (!ShapeUtil::IsOpaque(target_shape)) { + AttachAlignmentMetadataForLoad(param_address_untyped, target_shape); + AttachDereferenceableMetadataForLoad(param_address_untyped, + target_shape); + } + return param_address_untyped; + } + // Thread-local allocations should only be assigned a single buffer. const auto& assigned_buffers = allocation.assigned_buffers(); CHECK_EQ(1, assigned_buffers.size()); const Shape& shape = assigned_buffers.begin()->first->shape(); - llvm::AllocaInst*& tempbuf_address = - thread_local_buffers_[{b_.GetInsertBlock()->getParent(), slice}]; - if (tempbuf_address == nullptr) { - tempbuf_address = llvm_ir::EmitAllocaAtFunctionEntry( + std::pair key = { + compute_function_->function(), slice}; + auto buf_it = thread_local_buffers_.find(key); + if (buf_it == thread_local_buffers_.end()) { + llvm::Value* buffer = llvm_ir::EmitAllocaAtFunctionEntry( IrShapeType(shape), tensorflow::strings::StrCat("thread_local", slice.ToString()), &b_, MinimumAlignmentForShape(target_shape)); + auto it_inserted_pair = thread_local_buffers_.insert({key, buffer}); + CHECK(it_inserted_pair.second); + buf_it = it_inserted_pair.first; } - return b_.CreateBitCast(tempbuf_address, element_type->getPointerTo()); - } - - if (allocation.is_constant()) { - return FindOrDie(constant_buffer_to_global_, allocation.index()); - } + return buf_it->second; + }(); + return b_.CreateBitCast(tempbuf_address, + IrShapeType(target_shape)->getPointerTo()); +} +llvm::Value* IrEmitter::EmitGlobalTempBufferPointer( + const BufferAllocation::Slice& slice, const Shape& target_shape) { + const BufferAllocation& allocation = *slice.allocation(); llvm::Value* tempbuf_address_ptr = llvm_ir::EmitBufferIndexingGEP( GetTempBuffersArgument(), slice.index(), &b_); llvm::LoadInst* tempbuf_address_base = b_.CreateLoad(tempbuf_address_ptr); - if (is_top_level_computation_ && - hlo_module_config_.debug_options() + if (hlo_module_config_.debug_options() .xla_llvm_enable_invariant_load_metadata()) { - // In the entry computation the parameter slots in the %params argument are - // invariant through program execution. In computations that are called - // from the entry computation (via kWhile, kCall and kConditional) the - // parameter slots are *not* invariant since they're written to by their - // callers. tempbuf_address_base->setMetadata( llvm::LLVMContext::MD_invariant_load, llvm::MDNode::get(tempbuf_address_base->getContext(), /*MDs=*/{})); @@ -2753,85 +2722,25 @@ llvm::Value* IrEmitter::EmitTempBufferPointer( b_.CreateInBoundsGEP(tempbuf_address_base, b_.getInt64(slice.offset())); } return b_.CreateBitCast(tempbuf_address_untyped, - element_type->getPointerTo()); + IrShapeType(target_shape)->getPointerTo()); } -// Emits a function call returning a single array element. Allocates space -// for a single element_type value, and loads it after call. -llvm::Value* IrEmitter::EmitElementFunctionCall( - llvm::Function* function, const Shape& return_shape, - gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name) { - llvm::Value* return_value_buffer = EmitArrayFunctionCall( - function, return_shape, 1, parameter_addresses, name); - return b_.CreateLoad( - return_value_buffer, - AsStringRef(tensorflow::strings::StrCat(name, "_return_value"))); -} - -// Emits a core function call based on the following pseudo-code. -// -// char** parameter_addresses_buffer = -// allocate buffer with a pointer for each parameter to the function -// for each parameter index, i.e. for i = 0, ..., #parameters: -// parameter_addresses_buffer[i] = parameter_addresses[i] -// call function(return_value_buffer, -// parameter_addresses_buffer, -// temps) -// return return_value_buffer -- address of the return value. -void IrEmitter::EmitArrayFunctionCallInto( - llvm::Function* function, gtl::ArraySlice parameter_addresses, - llvm::Value* return_value_buffer, tensorflow::StringPiece name) { - b_.CreateCall(function, - GetArrayFunctionCallArguments( - parameter_addresses, &b_, name, - /*return_value_buffer=*/return_value_buffer, - /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), - /*temp_buffers_arg=*/GetTempBuffersArgument(), - /*profile_counters_arg=*/GetProfileCountersArgument())); -} - -llvm::Value* IrEmitter::EmitArrayFunctionCall( - llvm::Function* function, const Shape& return_shape, int64 element_count, - gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name) { - llvm::Value* elements = - llvm::ConstantInt::get(b_.getInt64Ty(), element_count); - PrimitiveType return_type = return_shape.element_type(); - llvm::Value* return_value_buffer = - llvm_ir::EmitAllocaAtFunctionEntryWithCount( - llvm_ir::PrimitiveTypeToIrType(return_type, module_), elements, - tensorflow::strings::StrCat(name, "_return_value_address"), &b_, - MinimumAlignmentForPrimitiveType(return_type)); - EmitArrayFunctionCallInto(function, parameter_addresses, return_value_buffer, - name); - return return_value_buffer; +llvm::Value* IrEmitter::EmitTempBufferPointer( + const BufferAllocation::Slice& slice, const Shape& target_shape) { + if (slice.allocation()->is_thread_local()) { + return EmitThreadLocalTempBufferPointer(slice, target_shape); + } else if (slice.allocation()->is_constant()) { + return FindOrDie(constant_buffer_to_global_, slice.allocation()->index()); + } else { + return EmitGlobalTempBufferPointer(slice, target_shape); + } } Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) { - llvm::Value* addr; const Shape& target_shape = op->shape(); - if (op == op->parent()->root_instruction()) { - // For the root node, we write directly to the output buffer of the - // function. - llvm::Argument* retval = compute_function_->result_arg(); - if ((ShapeUtil::IsArray(target_shape) && - !ShapeUtil::IsZeroElementArray(target_shape)) || - (ShapeUtil::IsTuple(target_shape) && - !ShapeUtil::IsEmptyTuple(target_shape))) { - llvm::AttrBuilder attr_builder; - attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape)); - attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape)); - retval->addAttrs(attr_builder); - } - addr = b_.CreateBitCast(retval, IrShapeType(target_shape)->getPointerTo()); - } else { - // For other nodes, we need the temporary buffer allocated for this node to - // write the result into. - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, - assignment_.GetUniqueTopLevelSlice(op)); - addr = EmitTempBufferPointer(slice, target_shape); - } + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, + assignment_.GetUniqueTopLevelSlice(op)); + llvm::Value* addr = EmitTempBufferPointer(slice, target_shape); addr->setName(AsStringRef(IrName(op))); emitted_value_[op] = addr; return Status::OK(); @@ -2936,20 +2845,69 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) { hlo, elemental_emitter.MakeElementGenerator(hlo, operand_to_generator)); } -StatusOr IrEmitter::EmitScalarCall( - PrimitiveType return_type, HloComputation* computation, - const std::vector& arguments, tensorflow::StringPiece name) { - llvm::Function* llvm_function = FindOrDie(emitted_functions_, computation); - std::vector argument_addrs; - for (auto argument : arguments) { - llvm::Value* argument_addr = llvm_ir::EmitAllocaAtFunctionEntry( - argument->getType(), "arg_addr", &b_); - b_.CreateStore(argument, argument_addr); - argument_addrs.push_back(argument_addr); +llvm::Value* IrEmitter::EmitThreadLocalCall( + const HloComputation& callee, + tensorflow::gtl::ArraySlice parameters, + tensorflow::StringPiece name) { + const Shape& return_shape = callee.root_instruction()->shape(); + + // Lifting this restriction to allow "small" arrays should be easy. Allowing + // larger arrays is difficult because we allocate the buffer for this return + // value on the stack. + CHECK(ShapeUtil::IsScalar(return_shape)); + + PrimitiveType return_type = return_shape.element_type(); + + std::vector parameter_addrs; + for (llvm::Value* parameter : parameters) { + CHECK(!parameter->getType()->isPointerTy()); + llvm::Value* parameter_addr = llvm_ir::EmitAllocaAtFunctionEntry( + parameter->getType(), "arg_addr", &b_); + b_.CreateStore(parameter, parameter_addr); + parameter_addrs.push_back(parameter_addr); + } + + llvm::Value* return_value_buffer = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(return_type, module_), + tensorflow::strings::StrCat(name, "_retval_addr"), &b_, + MinimumAlignmentForPrimitiveType(return_type)); + + b_.CreateCall( + FindOrDie(emitted_functions_, &callee), + GetArrayFunctionCallArguments( + parameter_addrs, &b_, name, + /*return_value_buffer=*/return_value_buffer, + /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), + /*temp_buffers_arg=*/ + llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo()), + /*profile_counters_arg=*/GetProfileCountersArgument())); + + return b_.CreateLoad(return_value_buffer); +} + +void IrEmitter::EmitGlobalCall(const HloComputation& callee, + tensorflow::StringPiece name) { + b_.CreateCall(FindOrDie(emitted_functions_, &callee), + GetArrayFunctionCallArguments( + /*parameter_addresses=*/{}, &b_, name, + /*return_value_buffer=*/ + llvm::Constant::getNullValue(b_.getInt8PtrTy()), + /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(), + /*temp_buffers_arg=*/GetTempBuffersArgument(), + /*profile_counters_arg=*/GetProfileCountersArgument())); +} + +llvm::Value* IrEmitter::GetBufferForGlobalCallReturnValue( + const HloComputation& callee) { + const HloInstruction* root_inst = callee.root_instruction(); + if (root_inst->opcode() == HloOpcode::kOutfeed) { + return llvm::Constant::getNullValue(b_.getInt8PtrTy()); } - return EmitElementFunctionCall(llvm_function, - ShapeUtil::MakeShape(return_type, {}), - argument_addrs, name); + + const BufferAllocation::Slice root_buffer = + assignment_.GetUniqueTopLevelSlice(root_inst).ValueOrDie(); + return EmitTempBufferPointer(root_buffer, root_inst->shape()); } + } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 03bbb2afb587e2f95bcd2743d396d3d996041a21..c9a1dab62dcbcd926baa82737d24efa03fd326e9 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -100,14 +100,15 @@ class IrEmitter : public DfsHloVisitorWithDefault { llvm::IRBuilder<>* b() { return &b_; } - // Emits a call to `computation` with scalar arguments `arguments`. - StatusOr EmitScalarCall( - PrimitiveType return_type, HloComputation* computation, - const std::vector& arguments, tensorflow::StringPiece name); - // Emit an LLVM global variable for every constant buffer allocation. Status EmitConstantGlobals(); + // Emit code to map one element according to `map_instr`. + llvm::Value* EmitElementalMap( + const HloMapInstruction& map_instr, + tensorflow::gtl::ArraySlice elemental_operands, + tensorflow::StringPiece name); + protected: // // The following methods implement the DfsHloVisitor interface. @@ -143,13 +144,13 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status HandleRecvDone(HloInstruction* recv_done) override; Status HandlePad(HloInstruction* pad) override; Status HandleTuple(HloInstruction* tuple) override; - Status HandleMap(HloInstruction* map) override; Status HandleFusion(HloInstruction* fusion) override; Status HandleCall(HloInstruction* call) override; Status HandleCustomCall(HloInstruction* custom_call) override; Status HandleWhile(HloInstruction* xla_while) override; Status HandleConcatenate(HloInstruction* concatenate) override; Status HandleConditional(HloInstruction* conditional) override; + Status HandleScatter(HloInstruction* scatter) override; Status HandleAfterAll(HloInstruction* gen_token) override; Status HandleIota(HloInstruction* iota) override; Status HandleRng(HloInstruction* rng) override; @@ -218,9 +219,18 @@ class IrEmitter : public DfsHloVisitorWithDefault { // computation function being emitted by this emitter. llvm::Value* GetTempBuffersArgument(); - // Emits code that computes the address of the given temporary buffer to the - // function. target_shape is the shape of this temporary buffer. - // The returned Value's type is a pointer to element_type. + // Helper for EmitTempBufferPointer. + llvm::Value* EmitGlobalTempBufferPointer(const BufferAllocation::Slice& slice, + const Shape& target_shape); + + // Helper for EmitTempBufferPointer. + llvm::Value* EmitThreadLocalTempBufferPointer( + const BufferAllocation::Slice& slice, const Shape& target_shape); + + // Emits code that computes the address of the given buffer allocation slice. + // + // TODO(sanjoy): This should be renamed to reflect that it no longer provides + // access to just temporaries. llvm::Value* EmitTempBufferPointer(const BufferAllocation::Slice& slice, const Shape& target_shape); @@ -232,44 +242,27 @@ class IrEmitter : public DfsHloVisitorWithDefault { tensorflow::StringPiece function_name_suffix); // Used for LLVM IR register names. - // Methods that emit a function call. - // Parameters: - // function - The LLVM function to call. - // return_shape - The return shape of the HLO computation that was used to - // make the function. Not the same as the return type of the function - // in LLVM, since we use output parameters for the return type. - // element_count - number of elements to return (array form only). - // parameter_addresses - pointers to be passed to the function as - // parameters. - // name - used for LLVM IR register names. - - // Emits a function call, returning a scalar, often an element of a larger - // array. Returns a Value for the scalar element returned by the function. - llvm::Value* EmitElementFunctionCall( - llvm::Function* function, const Shape& return_shape, - tensorflow::gtl::ArraySlice parameter_addresses, + // Emits a call to a thread local function (e.g. to the computation nested + // within a reduce or a map). Thread local callees (by definition) only write + // to and read from thread local allocations. + // + // `parameters` holds the *scalar values* that need to be passed to the + // callee. The return value is the scalar returned by the callee. + llvm::Value* EmitThreadLocalCall( + const HloComputation& callee, + tensorflow::gtl::ArraySlice parameters, tensorflow::StringPiece name); - // Array function call emitter. Stores the function's result into a supplied - // buffer. - // Parameters: - // function - The LLVM function to call. - // parameter_addresses - pointers to be passed to the function as - // parameters. - // return_value - pointer to a buffer where the call result is stored. - - void EmitArrayFunctionCallInto( - llvm::Function* function, - tensorflow::gtl::ArraySlice parameter_addresses, - llvm::Value* return_value_buffer, tensorflow::StringPiece name); - - // Array function call emitter. Returns a Value for the function's return - // value buffer address. The return value buffer is alloca'ed by this - // function. - llvm::Value* EmitArrayFunctionCall( - llvm::Function* function, const Shape& return_shape, int64 element_count, - tensorflow::gtl::ArraySlice parameter_addresses, - tensorflow::StringPiece name); + // Emits a call to a "global" function (e.g. to the computation nested within + // a kWhile or a kCall). Buffer assignment unabiguously assignes buffers to + // the parameters and return values for these computations so there is no need + // to explicitly pass parameters or return results. + void EmitGlobalCall(const HloComputation& callee, + tensorflow::StringPiece name); + + // Returns the buffer to which a global call to `callee` would have written + // its result. + llvm::Value* GetBufferForGlobalCallReturnValue(const HloComputation& callee); // Verifies that the element types of all of the given operand instructions // match and are of one of the given supported types. @@ -408,11 +401,10 @@ class IrEmitter : public DfsHloVisitorWithDefault { NameUniquer name_uniquer_; // Map containing all previously emitted computations. - std::map emitted_functions_; + std::map emitted_functions_; // Map containing all previously emitted thread-local temporary buffers. - std::map, - llvm::AllocaInst*> + std::map, llvm::Value*> thread_local_buffers_; // The following fields track the IR emission state. According to LLVM memory @@ -422,6 +414,16 @@ class IrEmitter : public DfsHloVisitorWithDefault { std::unique_ptr compute_function_; llvm::IRBuilder<> b_; + // The buffer allocation slice for the root of the computation being compiled. + // Only relevant for thread local computations. + BufferAllocation::Slice computation_root_allocation_; + + // Maps the buffer allocation slices for the parameters to the computation + // being compiled to their parameter numbers. Only relevant for thread local + // computations. + tensorflow::gtl::FlatMap + computation_parameter_allocations_; + // Maps HLO instructions to their index into the profile counter array. const std::unordered_map instruction_to_profile_idx_; diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc index 6aff838462ac6bfe8a31971108a721b66dbe45bd..2db4d000f5b149969c88fb4325ca28aa11dc3708 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_function.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc @@ -80,9 +80,16 @@ void IrFunction::Initialize(const string& function_name, // void function(i8* retval, i8* run_options, i8** params, i8** temps, // i64* dynamic_loop_bounds, i64* prof_counters) // - // retval: points to the returned value. - // params: address of an array with pointers to parameters. - // temps: address of an array with pointers to temporary buffers. + // For thread local functions: + // retval: points to the returned value. + // params: address of an array with pointers to parameters. + // temps: is null + // + // For global functions: + // retval: is null + // params: is null + // temps: address of an array with pointers to temporary buffers and entry + // computation parameters. // // Therefore, the generated function's signature (FunctionType) is statically // determined - parameter unpacking is done in code generated into the @@ -196,18 +203,25 @@ std::vector GetArrayFunctionCallArguments( llvm::IRBuilder<>* b, tensorflow::StringPiece name, llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg, llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) { - llvm::Value* parameter_addresses_buffer = - llvm_ir::EmitAllocaAtFunctionEntryWithCount( - b->getInt8PtrTy(), b->getInt32(parameter_addresses.size()), - tensorflow::strings::StrCat(name, "_parameter_addresses"), b); - for (size_t i = 0; i < parameter_addresses.size(); ++i) { - llvm::Value* parameter_as_i8ptr = - b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(), - AsStringRef(tensorflow::strings::StrCat( - name, "_parameter_", i, "_address_as_i8ptr"))); - llvm::Value* slot_in_param_addresses = - b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)}); - b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses); + llvm::Value* parameter_addresses_buffer; + + if (parameter_addresses.empty()) { + parameter_addresses_buffer = + llvm::Constant::getNullValue(b->getInt8PtrTy()->getPointerTo()); + } else { + parameter_addresses_buffer = llvm_ir::EmitAllocaAtFunctionEntryWithCount( + b->getInt8PtrTy(), b->getInt32(parameter_addresses.size()), + tensorflow::strings::StrCat(name, "_parameter_addresses"), b); + + for (size_t i = 0; i < parameter_addresses.size(); ++i) { + llvm::Value* parameter_as_i8ptr = + b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(), + AsStringRef(tensorflow::strings::StrCat( + name, "_parameter_", i, "_address_as_i8ptr"))); + llvm::Value* slot_in_param_addresses = + b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)}); + b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses); + } } const auto to_int8_ptr = [=](llvm::Value* ptr) { diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc index d03da46575b331de113cc5f33c2b4267504e8308..a5f34908d70dd18ec017bdf9833c7df40f80db07 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc @@ -20,6 +20,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/core/lib/core/blocking_counter.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -58,13 +59,14 @@ using ComputeFunctionType = void (*)(void*, const void*, const void**, void**, // [partition1_dim2_start] // [partition1_dim2_limit] // -void __xla_cpu_runtime_ParallelForkJoin( +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_ParallelForkJoin( void* result_ptr, const void* run_options_ptr, const void** params, void** temps, uint64* prof_counters, int32 num_partitions, int64* partitions, int32 num_partitioned_dims, void* function_ptr) { VLOG(2) << "ParallelForkJoin ENTRY" << " num_partitions: " << num_partitions << " num_partitioned_dims: " << num_partitioned_dims; + CHECK_EQ(params, nullptr); CHECK_GT(num_partitions, 1); CHECK_GT(num_partitioned_dims, 0); const xla::ExecutableRunOptions* run_options = @@ -79,9 +81,9 @@ void __xla_cpu_runtime_ParallelForkJoin( for (int32 i = 1; i < num_partitions; ++i) { const int64 offset = i * stride; run_options->intra_op_thread_pool()->enqueueNoNotification( - [i, function, result_ptr, run_options_ptr, params, temps, prof_counters, + [i, function, result_ptr, run_options_ptr, temps, prof_counters, partitions, offset, &bc]() { - function(result_ptr, run_options_ptr, params, temps, + function(result_ptr, run_options_ptr, nullptr, temps, &partitions[offset], prof_counters); bc.DecrementCount(); VLOG(3) << "ParallelForkJoin partition " << i << " done."; diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc index 39b13183ff093611a42b3931d45f64eadb420622..a71a85913cfef271bc2a226cb0cf2dd4204499a4 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc @@ -20,6 +20,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matvec.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/types.h" using tensorflow::int32; @@ -77,27 +78,24 @@ void MatMulImpl(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, } // namespace -void __xla_cpu_runtime_EigenMatMulF16(const void* run_options_ptr, - Eigen::half* out, Eigen::half* lhs, - Eigen::half* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF16( + const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, + Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, + int32 transpose_rhs) { MatMulImpl(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenMatMulF32(const void* run_options_ptr, float* out, - float* lhs, float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF32( + const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { MatMulImpl(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenMatMulF64(const void* run_options_ptr, double* out, - double* lhs, double* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF64( + const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { MatMulImpl(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc index f8c8dd5e93d53db8d87be0208b5cf4daac3464f1..8dc5f3c93b6ba1a722ea7b23b4b5190ac0600cd6 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML) +#if defined(INTEL_MKL) && !defined(INTEL_MKL_DNN_ONLY) #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "third_party/intel_mkl_ml/include/mkl_cblas.h" #include "third_party/intel_mkl_ml/include/mkl_service.h" @@ -23,6 +23,7 @@ limitations under the License. #define EIGEN_USE_THREADS #include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool" +#include "tensorflow/core/platform/dynamic_annotations.h" using tensorflow::int32; using tensorflow::int64; @@ -74,10 +75,9 @@ void MatMulF64(const void* run_options_ptr, double* out, double* lhs, } // namespace -void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out, - float* lhs, float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_MKLMatMulF32( + const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread @@ -88,11 +88,11 @@ void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out, // Set thread number back to the previous number. mkl_set_num_threads_local(prev_num_threads); } + // BLAS GEMM API for 64-bit Matrix Multiplication -void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out, - double* lhs, double* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_MKLMatMulF64( + const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, + int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread @@ -103,22 +103,26 @@ void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out, // Set thread number back to the previous number. mkl_set_num_threads_local(prev_num_threads); } -void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr, - float* out, float* lhs, - float* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr, + float* out, float* lhs, float* rhs, + int64 m, int64 n, int64 k, + int32 transpose_lhs, + int32 transpose_rhs) { // Set the thread number to 1 for single threaded excution. int prev_num_threads = mkl_set_num_threads_local(1); MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); // Set thread number back to the previous number. mkl_set_num_threads_local(prev_num_threads); } -void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr, - double* out, double* lhs, - double* rhs, int64 m, int64 n, - int64 k, int32 transpose_lhs, - int32 transpose_rhs) { + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr, + double* out, double* lhs, + double* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, + int32 transpose_rhs) { // Set the thread number to 1 for single threaded excution. int prev_num_threads = mkl_set_num_threads_local(1); MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc index 17303e2f0d34e531a3a56aa147608b949e0f43ae..16692e7f2e6145b2649b67987eef47916e958be2 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc @@ -17,6 +17,7 @@ limitations under the License. #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/service/cpu/runtime_matvec.h" +#include "tensorflow/core/platform/dynamic_annotations.h" #include "tensorflow/core/platform/types.h" using tensorflow::int32; @@ -71,7 +72,8 @@ void SingleThreadedMatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, } // namespace -void __xla_cpu_runtime_EigenSingleThreadedMatMulF16( +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulF16( const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs, Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { @@ -79,16 +81,22 @@ void __xla_cpu_runtime_EigenSingleThreadedMatMulF16( transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenSingleThreadedMatMulF32( - const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulF32(const void* run_options_ptr, + float* out, float* lhs, + float* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, + int32 transpose_rhs) { SingleThreadedMatMul(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } -void __xla_cpu_runtime_EigenSingleThreadedMatMulF64( - const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m, - int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr, + double* out, double* lhs, + double* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, + int32 transpose_rhs) { SingleThreadedMatMul(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); } diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc index c433bddc8432949905041b5e9e31fc6af9e8bd44..c35569c6619ba5b534c5d8bb7ad683d84b6ecf4b 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc @@ -220,7 +220,7 @@ TEST_F(InfeedTest, DISABLED_TwoInfeedsInTotalOrder) { // The body adds the reduced value of the Infeed data (first tuple element) // to the previous accumulator, and returns the accumulator and the continue // flag (second tuple element) as a tuple. - const auto build_body = [this, &result_shape](const Shape& infeed_shape) { + const auto build_body = [&result_shape](const Shape& infeed_shape) { XlaComputation body; XlaBuilder builder("body"); auto prev = Parameter(&builder, 0, result_shape, "prev"); diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 097fa23027bf55ad0b92c347c5a1209bb5836695..86d57581f84920e8005e8f3c420e7488fc095434 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -106,6 +106,7 @@ class DfsHloVisitorBase { virtual Status HandleConvolution(HloInstructionPtr hlo) = 0; virtual Status HandleFft(HloInstructionPtr fft) = 0; virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0; + virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCompare(HloInstructionPtr hlo) { return HandleElementwiseBinary(hlo); } @@ -233,6 +234,7 @@ class DfsHloVisitorBase { virtual Status HandleWhile(HloInstructionPtr hlo) = 0; virtual Status HandleConditional(HloInstructionPtr hlo) = 0; virtual Status HandleGather(HloInstructionPtr hlo) = 0; + virtual Status HandleScatter(HloInstructionPtr hlo) = 0; virtual Status HandlePad(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index f4316e0fb77855aad1c4710908df09c604da896e..617a5a2eb4796d8003099e39e3d26389e532e954 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -94,6 +94,9 @@ class DfsHloVisitorWithDefaultBase Status HandleCrossReplicaSum(HloInstructionPtr crs) override { return DefaultAction(crs); } + Status HandleAllToAll(HloInstructionPtr crs) override { + return DefaultAction(crs); + } Status HandleRng(HloInstructionPtr random) override { return DefaultAction(random); } @@ -194,6 +197,9 @@ class DfsHloVisitorWithDefaultBase Status HandleGather(HloInstructionPtr gather) override { return DefaultAction(gather); } + Status HandleScatter(HloInstructionPtr scatter) override { + return DefaultAction(scatter); + } Status HandleAfterAll(HloInstructionPtr token) override { return DefaultAction(token); } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index f883eb828c7f6365dfd4d5e0b514dc6894adc12b..2e9d6be2de4a2ab918d9a5ea4881ad3fd036792e 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -431,6 +431,8 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( return EmitCos(op->shape().element_type(), operand_value); case HloOpcode::kSin: return EmitSin(op->shape().element_type(), operand_value); + case HloOpcode::kTanh: + return EmitTanh(op->shape().element_type(), operand_value); case HloOpcode::kFloor: return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {operand_value}, @@ -1060,6 +1062,11 @@ StatusOr ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, return Unimplemented("atan2"); } +StatusOr ElementalIrEmitter::EmitTanh(PrimitiveType prim_type, + llvm::Value* value) const { + return Unimplemented("tanh"); +} + StatusOr ElementalIrEmitter::EmitReducePrecision( const HloInstruction* hlo, llvm::Value* x) const { if (hlo->operand(0)->shape().element_type() != F32) { @@ -1239,13 +1246,23 @@ StatusOr ElementalIrEmitter::ConvertValueForDistribution( // Convert raw integer to float in range [0, 1) if the element is a float. llvm::Value* elem_value = raw_value; if (elem_ir_ty->isFloatingPointTy()) { - elem_value = b_->CreateUIToFP(elem_value, elem_ir_ty); unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits(); CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64); - elem_value = b_->CreateFDiv( - elem_value, - llvm::ConstantFP::get(elem_ir_ty, - raw_value_size_in_bits == 64 ? 0x1p64 : 0x1p32)); + // Perform the division using the float type with the same number of bits + // as the raw value to avoid overflow. + if (raw_value_size_in_bits == 32) { + elem_value = b_->CreateUIToFP(elem_value, b_->getFloatTy()); + elem_value = b_->CreateFDiv( + elem_value, llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32))); + } else { + elem_value = b_->CreateUIToFP(elem_value, b_->getDoubleTy()); + elem_value = b_->CreateFDiv( + elem_value, llvm::ConstantFP::get(b_->getDoubleTy(), std::exp2(64))); + } + + if (elem_ir_ty != elem_value->getType()) { + elem_value = b_->CreateFPTrunc(elem_value, elem_ir_ty); + } } // Convert the value for the requested distribution. @@ -1302,6 +1319,7 @@ int32 GetNumberOfElementsPerPhiloxRngSample(PrimitiveType elem_prim_ty) { case F16: return 4; case U64: + case S64: case F64: return 2; default: @@ -2134,7 +2152,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; default: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + return [hlo](const IrArray::Index& index) { return Unimplemented("Unhandled opcode for elemental IR emission: %s", HloOpcodeString(hlo->opcode()).c_str()); }; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index fcb34557a52d35ef30a5dee643171e17407d05c2..1598a4dd85632cfa9835a81a21eddff3e57bfa1f 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -122,6 +122,9 @@ class ElementalIrEmitter { llvm::Value* lhs, llvm::Value* rhs) const; + virtual StatusOr EmitTanh(PrimitiveType prim_type, + llvm::Value* value) const; + virtual StatusOr EmitReducePrecision(const HloInstruction* hlo, llvm::Value* x) const; diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index e314a469f00abdb9f60ae812c0b78d273dc95dbe..0ce2db907b643f3beabd127388370dbe601179e1 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -60,17 +59,19 @@ Status GenericTransferManager::WriteSingleTupleIndexTable( void GenericTransferManager::TransferLiteralFromDevice( se::Stream* stream, const ShapedBuffer& device_buffer, - std::function>)> done) { + MutableBorrowingLiteral literal, std::function done) { Status status = stream->BlockHostUntilDone(); if (!status.ok()) { return done(status); } - done(TransferLiteralFromDeviceInternal(stream->parent(), device_buffer)); + + done(TransferLiteralFromDeviceInternal(stream->parent(), device_buffer, + literal)); } -StatusOr> -GenericTransferManager::TransferLiteralFromDeviceInternal( - se::StreamExecutor* executor, const ShapedBuffer& device_buffer) { +Status GenericTransferManager::TransferLiteralFromDeviceInternal( + se::StreamExecutor* executor, const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal) { VLOG(2) << "transferring literal from device ordinal " << executor->device_ordinal() << "; device buffer: " << device_buffer; TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal()); @@ -80,9 +81,6 @@ GenericTransferManager::TransferLiteralFromDeviceInternal( TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(), device_buffer.on_host_shape())); - std::unique_ptr literal = - Literal::CreateFromShape(device_buffer.on_host_shape()); - TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( device_buffer.on_host_shape(), [&](const Shape& subshape, const ShapeIndex& index) -> Status { @@ -91,12 +89,12 @@ GenericTransferManager::TransferLiteralFromDeviceInternal( /*source=*/device_buffer.buffer(index), /*size=*/GetByteSizeRequirement(subshape), /*destination=*/ - literal->untyped_data(index))); + literal.untyped_data(index))); } return Status::OK(); })); - return std::move(literal); + return Status::OK(); } Status GenericTransferManager::TransferLiteralToDeviceAsync( @@ -160,7 +158,7 @@ Status GenericTransferManager::TransferLiteralToInfeed( Status GenericTransferManager::TransferLiteralFromOutfeed( se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) { + MutableBorrowingLiteral literal) { return Unimplemented("Generic transfer from Outfeed"); } diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index 3cd002c1bf3555cc2d2891c88b3ad648f8d9fd8c..6c1a21587a7ef5199afb93715dc57be5139fbc22 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -19,7 +19,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/transfer_manager.h" -#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -41,9 +40,10 @@ class GenericTransferManager : public TransferManager { se::Platform::Id PlatformId() const override; - void TransferLiteralFromDevice( - se::Stream* stream, const ShapedBuffer& device_buffer, - std::function>)> done) override; + void TransferLiteralFromDevice(se::Stream* stream, + const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal, + std::function done) override; Status TransferLiteralToDeviceAsync( se::Stream* stream, const LiteralSlice& literal, @@ -53,7 +53,7 @@ class GenericTransferManager : public TransferManager { const LiteralSlice& literal) override; Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + MutableBorrowingLiteral literal) override; Status ResetDevices( tensorflow::gtl::ArraySlice executors) override; @@ -67,8 +67,9 @@ class GenericTransferManager : public TransferManager { const Shape& shape, se::DeviceMemoryBase* region) override; private: - StatusOr> TransferLiteralFromDeviceInternal( - se::StreamExecutor* executor, const ShapedBuffer& device_buffer); + Status TransferLiteralFromDeviceInternal(se::StreamExecutor* executor, + const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal); // The platform this transfer manager targets. const se::Platform::Id platform_id_; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index e0aae3866b3e5b25c611c49f4f3a8a4149e9f71e..a3f6e8d9893528642e05354994c1d826949c6063 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -153,7 +153,6 @@ cc_library( ":ir_emission_utils", ":parallel_loop_emitter", ":partition_assignment", - ":while_transformer", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -166,6 +165,7 @@ cc_library( "//tensorflow/compiler/xla/service:elemental_ir_emitter", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:name_uniquer", + "//tensorflow/compiler/xla/service:while_loop_analysis", "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util", "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", @@ -636,7 +636,6 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_liveness", "//tensorflow/compiler/xla/service:call_inliner", "//tensorflow/compiler/xla/service:conditional_simplifier", - "//tensorflow/compiler/xla/service:dot_decomposer", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", "//tensorflow/compiler/xla/service:hlo", @@ -656,7 +655,6 @@ cc_library( "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service:tuple_simplifier", "//tensorflow/compiler/xla/service:while_loop_constant_sinking", - "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion", "//tensorflow/compiler/xla/service:while_loop_simplifier", "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination", "//tensorflow/compiler/xla/service/gpu:cudnn_batchnorm_rewriter", @@ -749,6 +747,8 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # build_cleaner: keep ], @@ -787,32 +787,17 @@ tf_cc_test( ], ) -cc_library( - name = "while_transformer", - srcs = ["while_transformer.cc"], - hdrs = ["while_transformer.h"], - deps = [ - "//tensorflow/compiler/xla:literal", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:lib", - ], -) - tf_cc_test( name = "while_transformer_test", srcs = ["while_transformer_test.cc"], deps = [ ":instruction_fusion", - ":while_transformer", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:copy_insertion", "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:while_loop_analysis", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc index 5a63e65208ac3e8e23944bc31634f4d29d91c10c..7348307ec8a7286dfb733d6b9685862b20f11ac9 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/mutex.h" namespace xla { namespace gpu { @@ -137,6 +138,28 @@ string NumBytesToString(int64 bytes) { tensorflow::strings::HumanReadableNumBytes(bytes), " (", bytes, "B)"); } +// Acquires a process-global lock on the device pointed to by the given +// StreamExecutor. +// +// This is used to prevent other XLA instances from trying to autotune on this +// device while we're using it. +tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) { + static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); + // se::Platform*s are global singletons guaranteed to live forever. + static auto* mutexes = + new std::map, + tensorflow::mutex>(); + + tensorflow::mutex_lock global_lock(mu); + auto it = mutexes + ->emplace(std::piecewise_construct, + std::make_tuple(stream_exec->platform(), + stream_exec->device_ordinal()), + std::make_tuple()) + .first; + return tensorflow::mutex_lock{it->second}; +} + } // anonymous namespace // We could have caching here so that we don't redo this work for two identical @@ -155,6 +178,13 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( CudnnConvKind kind, const Shape& input_shape, const Shape& filter_shape, const Shape& output_shape, const Window& window, const ConvolutionDimensionNumbers& dnums, HloInstruction* instr) { + // Don't run this function concurrently on the same GPU. + // + // This is a bit of a hack and doesn't protect us against arbitrary concurrent + // use of a GPU, but it's sufficient to let us compile two HLO modules + // concurrently and then run them sequentially. + tensorflow::mutex_lock lock = LockGpu(stream_exec_); + // Create a stream for us to do our work on. se::Stream stream{stream_exec_}; stream.Init(); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index cc38db27e2680e950f74e104cef8829585c7b81c..9b6de115ad7e7f87e431f839c1690858f4bce3fd 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -210,11 +210,13 @@ StatusOr GpuElementalIrEmitter::EmitPowerOp( return make_sqrt(); } - if (hlo_module_config_.debug_options().xla_enable_fast_math() && - IsFPLiteralWithValue(rhs, -.5)) { + if (IsFPLiteralWithValue(rhs, -.5)) { VLOG(10) << "emitting pow(A, -.5) as 1/sqrt(A): " << op->ToString(); // LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX // rsqrt.approx instruction. + // + // TODO(jlebar): Does this happen with fastmath disabled? If not, should + // we force-enable it? TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt()); return b_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt); } @@ -272,27 +274,20 @@ StatusOr GpuElementalIrEmitter::EmitAtan2( prim_type); } -StatusOr GpuElementalIrEmitter::EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const { - PrimitiveType input_type = op->operand(0)->shape().element_type(); - PrimitiveType output_type = op->shape().element_type(); - switch (op->opcode()) { - case HloOpcode::kTanh: - // If we don't care much about precision, emit a fast approximation of - // tanh. - if (hlo_module_config_.debug_options().xla_enable_fast_math()) { - // Upcast F16 to F32 if necessary. - llvm::Type* type = - input_type == F16 ? b_->getFloatTy() : operand_value->getType(); - llvm::Value* input = b_->CreateFPCast(operand_value, type); - llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input); - return b_->CreateFPCast(fast_tanh, operand_value->getType()); - } - return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type}, - output_type); - default: - return ElementalIrEmitter::EmitFloatUnaryOp(op, operand_value); - } +StatusOr GpuElementalIrEmitter::EmitTanh( + PrimitiveType prim_type, llvm::Value* value) const { + // Emit a fast approximation of tanh instead of calling __nv_tanh. + // __nv_tanh is particularly bad because it contains branches, thus + // preventing LLVM's load-store vectorizer from working its magic across a + // function which contains tanh calls. + // + // This routine isn't numerically precise, but it's good enough for ML. + + // Upcast F16 to F32 if necessary. + llvm::Type* type = prim_type == F16 ? b_->getFloatTy() : value->getType(); + llvm::Value* input = b_->CreateFPCast(value, type); + llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input); + return b_->CreateFPCast(fast_tanh, value->getType()); } llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( @@ -445,6 +440,8 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( return b_->CreateLoad(accum_ptr); }; case HloOpcode::kReduce: + // TODO(b/112040122): This should be supported. + CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce"; return [=, &operand_to_generator]( const IrArray::Index& output_index) -> StatusOr { const HloInstruction* operand = hlo->operand(0); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index e3eacef133cb8b615a645ca2f11dd6dedf9f0176..84454d31bb820a3de6ef3364bd205b8115bd95c0 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -51,9 +51,6 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { const HloToElementGeneratorMap& operand_to_generator) const override; protected: - StatusOr EmitFloatUnaryOp( - const HloInstruction* op, llvm::Value* operand_value) const override; - StatusOr EmitFloatBinaryOp( const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) const override; @@ -85,6 +82,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitAtan2(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const override; + StatusOr EmitTanh(PrimitiveType prim_type, + llvm::Value* value) const override; + llvm::Value* EmitThreadId() const override; private: diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index b3a3c5dcb4d77889b65a119f09ddef9ba95d6b52..2fd2206324e5f763490780a54880825a772b7ea2 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -43,6 +43,8 @@ Status ForThunk::Initialize(const GpuExecutable& executable, Status ForThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, se::Stream* stream, HloExecutionProfiler* profiler) { + VLOG(2) << "Executing ForThunk with " << loop_limit_ << " iters for " + << (hlo_instruction() ? hlo_instruction()->ToString() : ""); auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); for (int64 i = 0; i < loop_limit_; ++i) { profiler->StartHloComputation(); diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index dbc7754e251eb8075ab97dd2f36bbc400530fcf5..74282c568c09921dbeec2e9cce79b6c73b6ea592 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -31,16 +32,19 @@ namespace { // dimensions. struct MatrixDescriptor { MatrixDescriptor(se::DeviceMemoryBase matrix_data, bool needs_transpose, - int64 matrix_num_rows, int64 matrix_num_cols) + int64 matrix_num_rows, int64 matrix_num_cols, + int64 matrix_batch_size) : data(matrix_data), transpose(needs_transpose), num_rows(matrix_num_rows), - num_cols(matrix_num_cols) {} + num_cols(matrix_num_cols), + batch_size(matrix_batch_size) {} se::DeviceMemoryBase data; bool transpose; // Whether this matrix needs to be transposed. int64 num_rows; int64 num_cols; + int64 batch_size; }; // Performs a gemm call without an explicit algorithm on lhs_matrix and @@ -50,6 +54,9 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, MatrixDescriptor output_matrix, double alpha, se::Stream* stream) { DCHECK(!output_matrix.transpose); + const int64 batch_size = lhs_matrix.batch_size; + CHECK_EQ(batch_size, rhs_matrix.batch_size); + CHECK_EQ(batch_size, output_matrix.batch_size); se::DeviceMemory lhs_data(lhs_matrix.data); se::DeviceMemory rhs_data(rhs_matrix.data); se::DeviceMemory output_data(output_matrix.data); @@ -60,13 +67,30 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, : se::blas::Transpose::kNoTranspose; auto k = lhs_matrix.transpose ? lhs_matrix.num_rows : lhs_matrix.num_cols; + if (batch_size == 1) { + return stream + ->ThenBlasGemm( + lhs_transpose, rhs_transpose, output_matrix.num_rows, + output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/alpha, + lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data, + /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0, + &output_data, /*leading dim of output=*/output_matrix.num_rows) + .ok(); + } + + int64 lhs_stride = lhs_matrix.num_rows * lhs_matrix.num_cols; + int64 rhs_stride = rhs_matrix.num_rows * rhs_matrix.num_cols; + int64 output_stride = output_matrix.num_rows * output_matrix.num_cols; return stream - ->ThenBlasGemm( + ->ThenBlasGemmStridedBatched( lhs_transpose, rhs_transpose, output_matrix.num_rows, - output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/alpha, - lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data, - /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0, - &output_data, /*leading dim of output=*/output_matrix.num_rows) + output_matrix.num_cols, /*size of reduce dim=*/k, + /*alpha=*/alpha, lhs_data, + /*leading dim of LHS=*/lhs_matrix.num_rows, lhs_stride, rhs_data, + /*leading dim of RHS=*/rhs_matrix.num_rows, rhs_stride, + /*beta=*/0.0, &output_data, + /*leading dim of output=*/output_matrix.num_rows, output_stride, + batch_size) .ok(); } @@ -93,6 +117,10 @@ bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix, se::blas::ProfileResult* output_profile_result) { DCHECK(!output_matrix.transpose); + CHECK_EQ(1, lhs_matrix.batch_size); + CHECK_EQ(1, rhs_matrix.batch_size); + CHECK_EQ(1, output_matrix.batch_size); + se::DeviceMemory lhs_data(lhs_matrix.data); se::DeviceMemory rhs_data(rhs_matrix.data); se::DeviceMemory output_data(output_matrix.data); @@ -141,9 +169,15 @@ StatusOr DoGemmAutotune( alpha, computation_type, algorithm, stream, &profile_result)); - if (profile_result.is_valid() && profile_result.elapsed_time_in_ms() < - best_result.elapsed_time_in_ms()) { - best_result = profile_result; + if (profile_result.is_valid()) { + VLOG(3) << "cublas gemm algorithm " << algorithm << " took " + << profile_result.elapsed_time_in_ms() << "ms"; + if (profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + } else { + VLOG(4) << "cublas gemm algorithm " << algorithm << " failed."; } } @@ -167,6 +201,8 @@ auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm) { return &DoGemm; case F64: return &DoGemm; + case C64: + return &DoGemm>; default: LOG(FATAL) << "Unsupported type."; } @@ -180,6 +216,8 @@ auto GetGemmWithAlgorithmFn(PrimitiveType type) return &DoGemmWithAlgorithm; case F64: return &DoGemmWithAlgorithm; + case C64: + return &DoGemmWithAlgorithm>; default: LOG(FATAL) << "Unsupported type."; } @@ -192,6 +230,8 @@ auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune) { return &DoGemmAutotune; case F64: return &DoGemmAutotune; + case C64: + return &DoGemmAutotune>; default: LOG(FATAL) << "Unsupported type."; } @@ -210,6 +250,8 @@ se::blas::ComputationType GetBlasComputationType(PrimitiveType type) { return se::blas::ComputationType::kF32; case F64: return se::blas::ComputationType::kF64; + case C64: + return se::blas::ComputationType::kComplexF32; default: LOG(FATAL) << "Unsupported type."; } @@ -263,12 +305,37 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, se::DeviceMemoryBase output_data = buffer_allocations.GetDeviceAddress(output_buffer_); + DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction()); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size(), + dim_nums.rhs_batch_dimensions_size()); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2, + ShapeUtil::Rank(output_shape_)); + + int64 row_dim = dim_nums.lhs_batch_dimensions_size(); + int64 col_dim = dim_nums.lhs_batch_dimensions_size() + 1; + int64 batch_size = std::accumulate(output_shape_.dimensions().begin(), + output_shape_.dimensions().end() - 2, 1, + std::multiplies()); + + // Check that the batch dims don't cover the last two dims. + for (int64 batch_dim : dim_nums.lhs_batch_dimensions()) { + CHECK_NE(row_dim, batch_dim); + CHECK_NE(col_dim, batch_dim); + } + + // Verify that the non-batch dimensions are minor-most. This is required for + // efficient access. + for (const auto* shape : {&lhs_shape_, &rhs_shape_, &output_shape_}) { + CHECK_LT(shape->layout().minor_to_major(row_dim), 2); + CHECK_LT(shape->layout().minor_to_major(col_dim), 2); + } + // BLAS gemm reduces rows of LHS and columns of RHS. The Dot operator between // matrices reduces dimension 1 of LHS and dimension 0 of RHS regardless of // their layout. Therefore, we should treat dimension 0 as row and dimension 1 // as column when mapping a matrix Dot to BLAS gemm. - int64 output_num_rows = output_shape_.dimensions(0); - int64 output_num_cols = output_shape_.dimensions(1); + int64 output_num_rows = output_shape_.dimensions(row_dim); + int64 output_num_cols = output_shape_.dimensions(col_dim); // BLAS gemm expects the inputs and the output are in column-major order. // Therefore, we need to convert dot between row-major matrices to that @@ -291,34 +358,46 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, // the leading dimension of the LHS matrix of gemm is the number of rows in // B^T and thus the number of columns in B. - auto make_descriptor = [this](se::DeviceMemoryBase data, const Shape& shape, - bool transpose) -> MatrixDescriptor { - bool is_row_major = LayoutUtil::Minor(shape.layout(), 0) != 0; - bool layout_mismatch = LayoutUtil::Minor(shape.layout(), 0) != - LayoutUtil::Minor(output_shape_.layout(), 0); - return MatrixDescriptor(data, transpose ^ layout_mismatch, - shape.dimensions(is_row_major), - shape.dimensions(!is_row_major)); + auto make_descriptor = [&](se::DeviceMemoryBase data, const Shape& shape, + bool transpose) -> MatrixDescriptor { + bool is_row_major = LayoutUtil::Minor(shape.layout(), row_dim) != 0; + bool layout_mismatch = LayoutUtil::Minor(shape.layout(), row_dim) != + LayoutUtil::Minor(output_shape_.layout(), row_dim); + return MatrixDescriptor( + data, transpose ^ layout_mismatch, + shape.dimensions(row_dim + static_cast(is_row_major)), + shape.dimensions(row_dim + static_cast(!is_row_major)), + batch_size); }; - DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction()); - const MatrixDescriptor lhs_descriptor = make_descriptor( - lhs_data, lhs_shape_, dim_nums.lhs_contracting_dimensions(0) == 0); + lhs_data, lhs_shape_, dim_nums.lhs_contracting_dimensions(0) == row_dim); const MatrixDescriptor rhs_descriptor = make_descriptor( - rhs_data, rhs_shape_, dim_nums.rhs_contracting_dimensions(0) == 1); + rhs_data, rhs_shape_, dim_nums.rhs_contracting_dimensions(0) == col_dim); // Dispatches to a regular cublas gemm, a gemm-with-algorithm, or attempts to // autotune this gemm to figure out the best algorithm. - auto launch = [this](MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, - MatrixDescriptor output_matrix, se::Stream* stream) { + auto launch = [&](MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix, + MatrixDescriptor output_matrix, se::Stream* stream) { PrimitiveType element_type = output_shape_.element_type(); se::blas::ComputationType computation_type = GetBlasComputationType(element_type); + // TODO(b/112111608): Implement auto tune for batched gemm. + if (batch_size != 1) { + return GetGemmFn(element_type)(lhs_matrix, rhs_matrix, output_matrix, + alpha_, stream); + } + + auto thunk_name = [&] { + return hlo_instruction() != nullptr ? hlo_instruction()->ToString() + : ""; + }; + const string& device_name = stream->parent()->GetDeviceDescription().name(); auto autotune_it = autotune_results_.find(device_name); if (autotune_it == autotune_results_.end()) { + VLOG(3) << "Starting autotune of GemmThunk " << thunk_name(); StatusOr best_algorithm = GetGemmAutotuneFn(element_type)(lhs_matrix, rhs_matrix, output_matrix, alpha_, computation_type, stream); @@ -326,11 +405,11 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, autotune_results_.insert({device_name, best_algorithm}).first; if (autotune_it->second.ok()) { - VLOG(2) << "Autotune on GemmThunk " << this + VLOG(2) << "Autotune on GemmThunk " << thunk_name() << " successful; best algorithm is " << best_algorithm.ValueOrDie(); } else { - VLOG(2) << "Autotune on GemmThunk " << this + VLOG(2) << "Autotune on GemmThunk " << thunk_name() << " unsuccessful. Will use generic gemm."; } } @@ -340,7 +419,7 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, if (best_algorithm.ok()) { auto algorithm = best_algorithm.ValueOrDie(); VLOG(2) << "Using algorithm " << algorithm - << " chosen by autotuning on GemmThunk " << this; + << " chosen by autotuning on GemmThunk " << thunk_name(); return GetGemmWithAlgorithmFn(element_type)( lhs_matrix, rhs_matrix, output_matrix, alpha_, computation_type, algorithm, stream, @@ -355,16 +434,16 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); bool launch_ok; - if (LayoutUtil::Minor(output_shape_.layout(), 0) == 0) { - launch_ok = launch( - lhs_descriptor, rhs_descriptor, - MatrixDescriptor(output_data, false, output_num_rows, output_num_cols), - stream); + if (LayoutUtil::Minor(output_shape_.layout(), row_dim) == 0) { + launch_ok = launch(lhs_descriptor, rhs_descriptor, + MatrixDescriptor(output_data, false, output_num_rows, + output_num_cols, batch_size), + stream); } else { - launch_ok = launch( - rhs_descriptor, lhs_descriptor, - MatrixDescriptor(output_data, false, output_num_cols, output_num_rows), - stream); + launch_ok = launch(rhs_descriptor, lhs_descriptor, + MatrixDescriptor(output_data, false, output_num_cols, + output_num_rows, batch_size), + stream); } if (!launch_ok) { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index 939c7f85e35b4fcb943a25aa6346d72798432920..12c81f9bfc6bfdac63edf9c826b835057107fa41 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -52,12 +52,12 @@ class GemmThunk : public Thunk { se::Stream* stream, HloExecutionProfiler* profiler) override; - // Returns true if we'll perform autotuning if run on the given stream. If - // so, we want the GPU to be quiescent during autotuning, so as not to - // introduce noise in our results. - bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override { - return autotune_results_.count( - stream->parent()->GetDeviceDescription().name()) != 0; + bool WillAutotuneKernel(se::Stream* stream) override { + // We will autotune this kernel if we don't already have a autotune result + // for the stream device. + return autotune_results_.find( + stream->parent()->GetDeviceDescription().name()) == + autotune_results_.end(); } private: @@ -75,6 +75,8 @@ class GemmThunk : public Thunk { // results. The map's value is the best algorithm we've found for this thunk // on this device, or an error if none of the algorithms worked and we should // use the regular gemm without an algorithm. + // + // TODO(b/112415150): Make this thread safe. std::unordered_map> autotune_results_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index bb71c79fd7646c9d3bad282d8041a9a05aec0485..70608379048871cf6ee72145fa9afff71a3eabe6 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -131,9 +131,10 @@ Status GpuExecutable::ExecuteThunks( stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get()); } - // If this thunk requests it, wait for all currently-executing thunks to - // finish. This is useful e.g. if the thunk is about to perform autotuning. - if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) { + // If this thunk is about to autotune then wait for all currently executing + // thunks to finish. This reduces noise and thus the probability of + // choosing a suboptimal algorithm. + if (thunk->WillAutotuneKernel(stream)) { TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone()); } @@ -293,7 +294,7 @@ StatusOr GpuExecutable::ExecuteOnStream( // the respective location in ShapedBuffer. std::set buffers_in_result; TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus( - [&buffer_allocations, &buffers_in_result, &shaped_buffer, this]( + [&buffer_allocations, &buffers_in_result, this]( const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { const auto& sources = this->GetRootPointsToSet().element(index); // The points-to set is unambiguous so the set should be a diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 6ac5dfbcd5e3bfcca179ff82819120ce14e3c9da..d033faee8d25ed81a1483f8314652ef999ab36c5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -176,6 +176,38 @@ Status GpuLayoutAssignment::AddBackendConstraints( TF_RETURN_IF_ERROR( AddBackendConstraintsToDnnConvCustomCall(instruction, constraints)); } + + // For batched dot we require the default layout. + // TODO(b/112111608): This is overly conservative, the only real restriction + // is that batch dimensions must be major. + if (instruction->opcode() == HloOpcode::kDot && + ImplementedAsGemm(*instruction) && + instruction->dot_dimension_numbers().lhs_batch_dimensions_size() > 0) { + // Verify that the batch dims come before the row and col dims. + const DotDimensionNumbers& dim_nums = + instruction->dot_dimension_numbers(); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size(), + dim_nums.rhs_batch_dimensions_size()); + CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2, + ShapeUtil::Rank(instruction->shape())); + for (int64 batch_dim : dim_nums.lhs_batch_dimensions()) { + CHECK_LT(batch_dim, ShapeUtil::Rank(instruction->shape()) - 2); + } + + // Set both inputs and the output to default layout. + Shape op0_shape = instruction->operand(0)->shape(); + LayoutUtil::SetToDefaultLayout(&op0_shape); + Shape op1_shape = instruction->operand(1)->shape(); + LayoutUtil::SetToDefaultLayout(&op1_shape); + Shape output_shape = instruction->shape(); + LayoutUtil::SetToDefaultLayout(&output_shape); + TF_RETURN_IF_ERROR( + constraints->SetOperandLayout(op0_shape, instruction, 0)); + TF_RETURN_IF_ERROR( + constraints->SetOperandLayout(op1_shape, instruction, 1)); + TF_RETURN_IF_ERROR( + constraints->SetInstructionLayout(output_shape, instruction)); + } } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc index 95f78ae29326caad2f0785e2ba285a996e685899..286547ebae2f1a4b8d783a06d13b4dd96052b952 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc @@ -20,8 +20,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -31,6 +33,8 @@ namespace xla { namespace gpu { namespace { +namespace op = xla::testing::opcode_matchers; + using LayoutAssignmentTest = HloTestBase; TEST_F(LayoutAssignmentTest, Elementwise) { @@ -327,6 +331,33 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { } } +TEST_F(LayoutAssignmentTest, DotLayout) { + const char* hlo_text = R"( + HloModule DotLayout + ENTRY dot { + p0 = f32[8,8,256,64]{3,1,2,0} parameter(0) + p1 = f32[8,8,256,64]{3,1,2,0} parameter(1) + ROOT dot.1330.10585 = f32[8,8,256,256]{3,2,1,0} dot(p0, p1), + lhs_batch_dims={0,1}, lhs_contracting_dims={3}, + rhs_batch_dims={0,1}, rhs_contracting_dims={3} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text)); + + ComputationLayout computation_layout( + module->entry_computation()->ComputeProgramShape()); + GpuLayoutAssignment layout_assignment(&computation_layout, + backend().default_stream_executor()); + EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); + + Shape expected_shape = + ShapeUtil::MakeShapeWithLayout(F32, {8, 8, 256, 64}, {3, 2, 1, 0}); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Dot(op::ShapeWithLayout(expected_shape), + op::ShapeWithLayout(expected_shape))); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index 79b3f1efecdf06bfa93b17a1799f3009d517f3b5..a2f53f844613da9fe8166489dc9959e8d30c6332 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -117,38 +117,37 @@ StatusOr GpuTransferManager::TransferBufferToInfeedInternal( return std::move(buffer); } -static std::unique_ptr ShapeTreeToLiteral( +static void ShapeTreeToLiteral( ShapeTree>* shape_tree) { // This is a struct instead of a lambda for std::function-free recursion. struct Helper { - static std::unique_ptr helper( + static void helper( ShapeTree>* shape_tree, ShapeIndex* index) { const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index); if (ShapeUtil::IsArray(shape)) { - return (*shape_tree->mutable_element(*index))->WaitUntilAvailable(); + (*shape_tree->mutable_element(*index))->WaitUntilAvailable(); + return; } CHECK(ShapeUtil::IsTuple(shape)) << ShapeUtil::HumanStringWithLayout(shape); const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape); index->push_back(0); - std::vector> tuple_operands; for (int64 i = 0; i < tuple_element_count; ++i) { index->back() = i; - tuple_operands.push_back(helper(shape_tree, index)); + helper(shape_tree, index); } index->pop_back(); - return LiteralUtil::MakeTupleOwned(std::move(tuple_operands)); } }; ShapeIndex index; - return Helper::helper(shape_tree, &index); + Helper::helper(shape_tree, &index); } Status GpuTransferManager::TransferLiteralFromOutfeed( se::StreamExecutor* /*executor*/, const Shape& literal_shape, - Literal* literal) { + MutableBorrowingLiteral literal) { ShapeTree> outfeed_buffers( &literal_shape); @@ -162,6 +161,8 @@ Status GpuTransferManager::TransferLiteralFromOutfeed( return; } *buffer = MakeUnique(GetByteSizeRequirement(shape)); + (*buffer)->set_destination( + MakeUnique(literal, index)); }); // Give the tree of buffers to the outfeed mananger. The device will fill it @@ -169,8 +170,8 @@ Status GpuTransferManager::TransferLiteralFromOutfeed( gpu::OutfeedManager* outfeed_manager = gpu::GetOrCreateOutfeedManager(); outfeed_manager->EnqueueDestination(&outfeed_buffers); - // Now turn the tree of buffers back into a literal. - *literal = std::move(*ShapeTreeToLiteral(&outfeed_buffers)); + // Now wait for the tree of buffers are written. + ShapeTreeToLiteral(&outfeed_buffers); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h index dceeb9e2eb01a7dd5e978d819ed1db56d828f353..7929042869763dfeab2fe8f87093b7ea758337d0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h @@ -42,7 +42,7 @@ class GpuTransferManager : public GenericTransferManager { const LiteralSlice& literal) override; Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + MutableBorrowingLiteral literal) override; private: // Initiates the infeed data transfers. InfeedBuffer->Done() must be diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 6352b330d17d77da65ed4ffb5a225535ff6caf82..c349063c71f000435a05306101ad724505f2d197 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -38,24 +38,27 @@ namespace gpu { namespace { // Return whether the given shape is a matrix with no padding. -bool IsRank2WithNoPadding(const Shape& shape) { - return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape); +bool IsRank2WithNoPadding(const Shape& shape, int64 batch_dimensions_size) { + return ShapeUtil::Rank(shape) == batch_dimensions_size + 2 && + !LayoutUtil::IsPadded(shape); } // In a gemm operation where output = lhs * rhs, check whether the given shapes // are valid for the operation. bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, - const Shape& output_shape) { + const Shape& output_shape, + int64 batch_dimensions_size) { // The inputs and the output must // 1) be matrices with no padding and a non-zero number of elements, // 2) have an allowed element type. PrimitiveType output_primitive_type = output_shape.element_type(); bool type_is_allowed = (output_primitive_type == F16 || output_primitive_type == F32 || - output_primitive_type == F64); - return type_is_allowed && IsRank2WithNoPadding(lhs_shape) && - IsRank2WithNoPadding(rhs_shape) && - IsRank2WithNoPadding(output_shape) && + output_primitive_type == F64 || output_primitive_type == C64); + return type_is_allowed && + IsRank2WithNoPadding(lhs_shape, batch_dimensions_size) && + IsRank2WithNoPadding(rhs_shape, batch_dimensions_size) && + IsRank2WithNoPadding(output_shape, batch_dimensions_size) && !ShapeUtil::IsZeroElementArray(lhs_shape) && !ShapeUtil::IsZeroElementArray(rhs_shape); } @@ -64,14 +67,15 @@ bool DotImplementedAsGemm(const HloInstruction& dot) { CHECK_EQ(dot.opcode(), HloOpcode::kDot); const Shape& lhs_shape = dot.operand(0)->shape(); const Shape& rhs_shape = dot.operand(1)->shape(); + const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers(); // If gemm can accept the operand shapes, use it rather than a custom // kernel. - if (AreValidGemmShapes(lhs_shape, rhs_shape, dot.shape())) { + if (AreValidGemmShapes(lhs_shape, rhs_shape, dot.shape(), + dim_numbers.lhs_batch_dimensions_size())) { // The size of the reduction dimension should match. The shape inference // guarantees this invariant, so the check here is for programming // errors. - const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers(); CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)), rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0))); return true; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 1295e83c0c4c16a1a18eaaadbafb5fd226be6eff..6675dbd3f9eef8d13c9dec200e5bf47faa5b514d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -64,7 +64,7 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config, hlo_module_config_(hlo_module_config) { b_.setFastMathFlags(llvm_ir::GetFastMathFlags( /*fast_math_enabled=*/hlo_module_config.debug_options() - .xla_enable_fast_math())); + .xla_gpu_enable_fast_math())); } Status IrEmitter::DefaultAction(HloInstruction* hlo) { @@ -125,6 +125,10 @@ Status IrEmitter::HandleRecvDone(HloInstruction*) { return Unimplemented("Recv-done is not implemented on GPU"); } +Status IrEmitter::HandleScatter(HloInstruction*) { + return Unimplemented("Scatter is not implemented on GPUs."); +} + Status IrEmitter::HandleTuple(HloInstruction* tuple) { std::vector base_ptrs; for (const HloInstruction* operand : tuple->operands()) { @@ -450,6 +454,9 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { const Shape& lhs_shape = lhs_instruction->shape(); const Shape& rhs_shape = rhs_instruction->shape(); + const DotDimensionNumbers& dnums = dot->dot_dimension_numbers(); + CHECK_EQ(dnums.lhs_batch_dimensions_size(), + dnums.rhs_batch_dimensions_size()); // TODO(b/110211620): Convert to use i32 index_type when it is possible. llvm::Type* index_type = b_.getInt64Ty(); @@ -485,9 +492,15 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { const int64 lhs_reduction_dimension = ShapeUtil::GetDimensionNumber(lhs_shape, -1); const int64 rhs_reduction_dimension = - ShapeUtil::Rank(rhs_shape) >= 2 + ShapeUtil::Rank(rhs_shape) >= 2 + dnums.lhs_batch_dimensions_size() ? ShapeUtil::GetDimensionNumber(rhs_shape, -2) - : 0; + : dnums.lhs_batch_dimensions_size(); + + // Check that the batch dims don't cover the last two dims. + for (int64 batch_dim : dnums.lhs_batch_dimensions()) { + CHECK_NE(lhs_reduction_dimension, batch_dim); + CHECK_NE(rhs_reduction_dimension, batch_dim); + } // Verify the reduction dimension in the two operands are the same size. TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) == @@ -502,6 +515,13 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest( rhs_array, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs"); + // We don't have to iterate over the batch dimensions in both arrays, simplify + // the loop nest of the rhs. + for (int i = 0; i != dnums.lhs_batch_dimensions_size(); ++i) { + DCHECK(c_linear_search(dnums.lhs_batch_dimensions(), i)); + rhs_index[i] = lhs_index[i]; + } + // Create the reduction loop which does the sum of products reduction. std::unique_ptr reduction_loop = loop_nest.AddLoop( /*start_index=*/0, @@ -564,7 +584,9 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { target_index.push_back(lhs_index[dimension]); } } - for (size_t dimension = 0; dimension < rhs_index.size(); ++dimension) { + // Skip over the batch dimensions to not have them in the index twice. + for (size_t dimension = dnums.lhs_batch_dimensions_size(); + dimension < rhs_index.size(); ++dimension) { if (dimension != rhs_reduction_dimension) { target_index.push_back(rhs_index[dimension]); } @@ -610,6 +632,10 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) { } Status IrEmitter::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Support variadic reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Variadic reduce is not supported on GPU"); + } auto arg = reduce->operand(0); auto init_value = reduce->operand(1); tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 80e2a203ac3a1fbe95bf38a886288ea8be130148..561c6838798aa92ce2c96b3c45d5ba42fe6edef3 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -86,6 +86,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status HandleParameter(HloInstruction* parameter) override; Status HandleReduce(HloInstruction* reduce) override; Status HandleTuple(HloInstruction* tuple) override; + Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleFusion(HloInstruction* fusion) override; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 874c7cfb8ae9c2f23c6af9b28f06395730dccf2d..1e81cbde35372d9f7d6ee234d2408038d6f99dc7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -56,7 +56,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h" #include "tensorflow/compiler/xla/service/gpu/while_thunk.h" -#include "tensorflow/compiler/xla/service/gpu/while_transformer.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -68,6 +67,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -171,40 +171,6 @@ Status IrEmitterUnnested::Postprocess(HloInstruction* hlo) { return DfsHloVisitor::Postprocess(hlo); } -namespace { -bool ImplementedAsHostToDeviceMemcpy(const BufferAssignment& buffer_assignment, - const HloInstruction& hlo) { - // `hlo` needs to satisfy the following conditions to be implemented as a - // host-to-device cuMemcpy. - // - // 1. `hlo` is a kCopy instruction. - // 2. `hlo`'s only operand is a kConstant instruction. - // 3. `hlo` and its operand have the same shape (thus the same layout too). - // 4. The address of `hlo`'s buffer is known at runtime (without dereferencing - // pointers in a tuple). - return hlo.opcode() == HloOpcode::kCopy && - hlo.operand(0)->opcode() == HloOpcode::kConstant && - ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) && - buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok(); -} - -bool ImplementedAsDeviceToDeviceMemcpy( - const BufferAssignment& buffer_assignment, const HloInstruction& hlo) { - // `hlo` needs to satisfy three conditions to be implemented as a - // device-to-device cuMemcpy. - // - // 1. `hlo` is a kCopy instruction. - // 2. `hlo` and its operand have the same shape (thus the same layout too). - // 3. `hlo` and its operand have a statically-known buffer assignment - // (constants do not, for instance), which means the source buffer also - // resides on the device. - return hlo.opcode() == HloOpcode::kCopy && - ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) && - buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok() && - buffer_assignment.GetUniqueTopLevelSlice(hlo.operand(0)).ok(); -} -} // namespace - llvm::Function* IrEmitterUnnested::BuildKernelPrototype( const HloInstruction& inst, tensorflow::gtl::ArraySlice args) { @@ -379,11 +345,6 @@ Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) { } Status IrEmitterUnnested::HandleDot(HloInstruction* dot) { - const DotDimensionNumbers& dnums = dot->dot_dimension_numbers(); - if (dnums.lhs_batch_dimensions_size() > 0 || - dnums.rhs_batch_dimensions_size() > 0) { - return Unimplemented("Dot with batch dimensions not implemented."); - } if (ImplementedAsGemm(*dot)) { thunk_sequence_->emplace_back(BuildGemmThunk(dot)); return Status::OK(); @@ -584,6 +545,11 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { switch (root->opcode()) { case HloOpcode::kTuple: case HloOpcode::kReduce: { + if (root->opcode() == HloOpcode::kReduce && + ShapeUtil::IsTuple(root->shape())) { + // TODO(b/112040122): Support variadic reduce. + return Unimplemented("Variadic reduce is not supported on GPU"); + } VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString(); std::vector> thunks; ArraySlice output_instructions = @@ -730,13 +696,12 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { } Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) { - if (ImplementedAsHostToDeviceMemcpy(ir_emitter_context_->buffer_assignment(), - *copy)) { - thunk_sequence_->emplace_back(BuildHostToDeviceCopyThunk(copy)); - return Status::OK(); - } - if (ImplementedAsDeviceToDeviceMemcpy( - ir_emitter_context_->buffer_assignment(), *copy)) { + CHECK(ShapeUtil::Compatible(copy->operand(0)->shape(), copy->shape())); + const BufferAssignment& buffer_assignment = + ir_emitter_context_->buffer_assignment(); + if (LayoutUtil::Equal(copy->operand(0)->shape().layout(), + copy->shape().layout()) && + buffer_assignment.GetUniqueTopLevelSlice(copy->operand(0)).ok()) { thunk_sequence_->emplace_back(BuildDeviceToDeviceCopyThunk(copy)); return Status::OK(); } @@ -1734,6 +1699,10 @@ Status IrEmitterUnnested::EmitReductionToVector( } Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { + // TODO(b/112040122): Support multi-output reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Multi-output reduce is not supported on GPU"); + } auto input = reduce->operand(0); auto init_value = reduce->operand(1); tensorflow::gtl::ArraySlice dimensions_to_reduce(reduce->dimensions()); @@ -2003,19 +1972,13 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) { condition->root_instruction()->shape().element_type() == PRED) << "While condition computation must return bool"; // Build ForThunk for conformant while loops, otherwise build WhileThunk. - auto result = CanTransformWhileToFor(xla_while); - if (result.ok()) { - auto tuple = result.ConsumeValueOrDie(); - // loop_trip_count = (limit - start + increment - 1) / increment - const int64 loop_trip_count = - (std::get<1>(tuple) - std::get<0>(tuple) + std::get<2>(tuple) - 1) / - std::get<2>(tuple); - thunk_sequence_->emplace_back(BuildForThunk(xla_while, loop_trip_count)); + // TODO(b/112163966): Move trip count computation earlier in the pipeline. + if (auto loop_trip_count = ComputeWhileLoopTripCount(xla_while)) { + thunk_sequence_->emplace_back(BuildForThunk(xla_while, *loop_trip_count)); VLOG(3) << "Built ForThunk for while: " << xla_while->name(); } else { thunk_sequence_->emplace_back(BuildWhileThunk(xla_while)); - VLOG(3) << "Built WhileThunk for while: " << xla_while->name() - << " while-to-for transform status: " << result.status(); + VLOG(3) << "Built WhileThunk for while: " << xla_while->name(); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 6c1c20fc0464927054deace8980620c3a9c6f09b..ff4ae1f9ef2ad2fda4bb9100de93019c0b88fbd1 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -114,21 +114,20 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path, // Gets the GPU name as it's known to LLVM for a given compute capability. If // we see an unrecognized compute capability, we return "sm_30". static string GetSmName(std::pair compute_capability) { - static auto* m = new std::map, int>( - {{{2, 0}, 20}, - {{2, 1}, 21}, - {{3, 0}, 30}, - {{3, 2}, 32}, - {{3, 5}, 35}, - {{3, 7}, 37}, - {{5, 0}, 50}, - {{5, 2}, 52}, - {{5, 3}, 53}, - {{6, 0}, 60}, - {{6, 1}, 61}, - {{6, 2}, 62}, - // TODO: Change this to 70 once LLVM NVPTX supports it - {{7, 0}, 60}}); + static auto* m = new std::map, int>({ + {{3, 0}, 30}, + {{3, 2}, 32}, + {{3, 5}, 35}, + {{3, 7}, 37}, + {{5, 0}, 50}, + {{5, 2}, 52}, + {{5, 3}, 53}, + {{6, 0}, 60}, + {{6, 1}, 61}, + {{6, 2}, 62}, + {{7, 0}, 70}, + {{7, 2}, 72}, + }); int sm_version = 30; auto it = m->find(compute_capability); if (it != m->end()) { @@ -181,7 +180,7 @@ std::unique_ptr GetTargetMachine( TargetOptions target_options = InitTargetOptionsFromCodeGenFlags(); llvm_ir::SetTargetOptions( /*fast_math_enabled=*/hlo_module_config.debug_options() - .xla_enable_fast_math(), + .xla_gpu_enable_fast_math(), &target_options); // Enable FMA synthesis. @@ -329,7 +328,7 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module, if (linker.linkInModule( std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, [](Module& M, const StringSet<>& GVS) { - internalizeModule(M, [&M, &GVS](const GlobalValue& GV) { + internalizeModule(M, [&GVS](const GlobalValue& GV) { return !GV.hasName() || (GVS.count(GV.getName()) == 0); }); })) { diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index c67dcbce77a19abe60f9f871f7cce09b20d3d455..c62bae0628f7b2fbfe822104fbe5f3528e0e09c3 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -115,15 +115,23 @@ bool IsInputFusibleReduction(HloInstruction* instr) { // will be broadcasted and have not been observed to cause data locality issues. // TODO(b/111977086): Improve reduce emitters to remove this limitation. bool ReduceFriendlyInputLayouts(HloInstruction* instr) { + std::vector params; + if (instr->opcode() == HloOpcode::kFusion) { + params = instr->fused_parameters(); + } else { + for (HloInstruction* operand : instr->operands()) { + params.push_back(operand); + } + } int64 max_rank = 0; const Layout* max_rank_layout; - for (HloInstruction* param : instr->fused_parameters()) { + for (HloInstruction* param : params) { if (ShapeUtil::Rank(param->shape()) > max_rank) { max_rank = ShapeUtil::Rank(param->shape()); max_rank_layout = ¶m->shape().layout(); } } - return c_all_of(instr->fused_parameters(), [&](HloInstruction* param) { + return c_all_of(params, [&](HloInstruction* param) { return (ShapeUtil::Rank(param->shape()) < max_rank) || (LayoutUtil::Equal(param->shape().layout(), *max_rank_layout)); }); @@ -221,7 +229,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() { const bool is_loop_fusion = producer->opcode() == HloOpcode::kFusion && producer->fusion_kind() == HloInstruction::FusionKind::kLoop; - if (!is_loop_fusion) { + if (!producer->IsElementwise() && !is_loop_fusion) { VLOG(3) << producer->name() << " is not a loop fusion."; continue; } diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc index ec4234b8d9a5da299a9dc574169b0bb5fe6a575f..14f157a5e518a0ec82c664c123629d04bd385bbf 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc @@ -256,6 +256,26 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionTwoLoops) { op::Tuple(op::Multiply(), op::Divide())); } +TEST_F(MultiOutputFusionTest, ProducerConsumerFusionElementwiseAndReduce) { + auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"( + ENTRY reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + exp = f32[2,2,2]{2,1,0} exponential(p0) + reduce = f32[2,2]{1,0} reduce(exp, c0), dimensions={2}, to_apply=scalar_add_computation + ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce, exp) + })")) + .ValueOrDie(); + ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); + SCOPED_TRACE(module->ToString()); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::GetTupleElement(), op::GetTupleElement())); + const HloInstruction* fusion = root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Exp())); +} + TEST_F(MultiOutputFusionTest, ProducerConsumerFusionLoopFusionAndReduce) { auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"( fused_add { diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 7a683ede54354245c07ee2559172efe298f34950..76c9b6ab33befa98f03821fac84071bd978ae24d 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_liveness.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/conditional_simplifier.h" -#include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" @@ -76,7 +75,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" -#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" #include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -148,7 +146,6 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, // support BF16 operations without directly implementing a BF16 lowering for // most ops. pipeline.AddPass(BF16, F32); - pipeline.AddPass(); { auto& pass = @@ -283,14 +280,6 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, } } - { - // Do an aggressive LICM pass over while loops. In particular, this hoists - // constants that were sunk by WhileLoopConstantSinking. Leaving them in - // the while loop may result in unnecessary copies. - HloPassPipeline pipeline("while-loop-licm"); - pipeline.AddPass(true); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_manager.h b/tensorflow/compiler/xla/service/gpu/outfeed_manager.h index a752eb70119b00e8cca7ddce26da7730ef5db8cb..160ba4b691f818ff01b41b8603c11853ea12c253 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_manager.h +++ b/tensorflow/compiler/xla/service/gpu/outfeed_manager.h @@ -36,22 +36,19 @@ class OutfeedBuffer { OutfeedBuffer(int64 length) : length_(length) {} // Waits for the device transfer to be finished. - std::unique_ptr WaitUntilAvailable() { - done_.WaitForNotification(); - return std::move(destination_); - } + void WaitUntilAvailable() { done_.WaitForNotification(); } int64 length() const { return length_; } - void set_destination(std::unique_ptr destination) { + void set_destination(std::unique_ptr destination) { destination_ = std::move(destination); } - Literal* destination() { return destination_.get(); } + MutableBorrowingLiteral* destination() { return destination_.get(); } // Callback to signal that this buffer is consumed. void Done() { done_.Notify(); } private: - std::unique_ptr destination_; + std::unique_ptr destination_; const int64 length_; tensorflow::Notification done_; }; diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc index 7986e63f43ee508370f94fdb9057b91bfe4add18..b99d998c4d7df514c024b1f8d643d08c72059d0e 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc @@ -50,10 +50,6 @@ Status OutfeedThunk::ExecuteOnStream( if (!*buffer) { // Tuple pointers. return Status::OK(); } - // Allocate storage for the literal data. - const Shape& shape = - ShapeUtil::GetSubshape(outfeed_buffers->shape(), index); - (*buffer)->set_destination(Literal::CreateFromShape(shape)); BufferAllocation::Slice slice = outfeed_slices_.element(index); se::DeviceMemoryBase data_address; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 4df0bb005b623e5ac79a4dfcb7c5a8a7a400940c..e68bee035a029178844282995429eaa960cc4817 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -82,17 +82,9 @@ class Thunk { return Status::OK(); } - // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream) - // before calling ExecuteOnStream(stream). If it returns true, it's the - // user's responsibility to wait for all activity on the GPU to finish before - // calling ExecuteOnStream. - // - // This value is not required to be constant for a given Thunk. For example, - // a Thunk that performs autotuning may return true for its first run and - // false thereafter. - virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) { - return false; - } + // Returns true if this kernel will autotune for the stream device the next + // time it is run. + virtual bool WillAutotuneKernel(se::Stream* /*stream*/) { return false; } // Execute the kernel for the thunk on the given stream. This method must be // called after Initialize and can be called multiple times over Thunk's diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index a10e40451c1db01ce73db7b56a3a0599769fa49b..8579b1545fd24f80621ac0f53b997e33586cbabe 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -24,24 +24,32 @@ namespace gpu { Status TupleThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, se::Stream* stream, HloExecutionProfiler* profiler) { - std::vector tuple_element_buffer_addresses; - for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) { - tuple_element_buffer_addresses.push_back( - buffer_allocations.GetDeviceAddress(tuple_element_buffer).opaque()); + auto size = tuple_element_buffers_.size(); + auto tuple_element_buffer_addresses = MakeUnique(size); + for (int i = 0; i != size; ++i) { + tuple_element_buffer_addresses[i] = + buffer_allocations.GetDeviceAddress(tuple_element_buffers_[i]).opaque(); } se::DeviceMemory dest_buffer_address( buffer_allocations.GetDeviceAddress(dest_buffer_)); - auto host_size = tuple_element_buffer_addresses.size() * sizeof(void*); + auto host_size = size * sizeof(void*); auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction()); if (!stream ->ThenMemcpy(&dest_buffer_address, - tuple_element_buffer_addresses.data(), host_size) + tuple_element_buffer_addresses.get(), host_size) .ok()) { return InternalError( "Unable to launch MemcpyH2D from %p to %p with size %lu", - tuple_element_buffer_addresses.data(), dest_buffer_address.opaque(), - sizeof(void*) * tuple_element_buffer_addresses.size()); + tuple_element_buffer_addresses.get(), dest_buffer_address.opaque(), + host_size); + } + // Free the tuple address buffer when memcpy is done. + auto* buffers_raw = tuple_element_buffer_addresses.release(); + if (!stream->ThenDoHostCallback([buffers_raw] { delete[] buffers_raw; }) + .ok()) { + delete[] buffers_raw; + return InternalError("Unable to enqueue host callback!"); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc deleted file mode 100644 index c5321df6c466fcb3816fb2aedad65b7c3811cb37..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc +++ /dev/null @@ -1,521 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/gpu/while_transformer.h" - -#include -#include - -#include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace xla { -namespace gpu { - -namespace { - -// TODO(b/33483676) Use an expression tree to specify computations to pattern -// match for while transformations. - -// ExprTree is a simple recursive data structure used to express computation -// patterns to match. -// -// Each ExprTree node is comprised of an HloOpcode, and a set of operands (each -// of type ExprTree). Operands can be added by specifying the index and -// HloOpcode of the operand. -// -// For example, the following computation: -// -// Parameter -// | -// Const GetTupleElement -// \ / -// Add (root) -// -// Can be matched with the following expression tree: -// -// ExprTree add(HloOpcode::kAdd, -// ExprTree(HloOpcode::kConstant), -// ExprTree(HloOpcode::kGetTupleElement, -// tuple_index, ExprTree(HloOpcode::kParameter))); -// -// Match the ExprTree root against an Hlo graph: -// -// ExprTree::TaggedInstructionMap tagged_instructions; -// TF_RETURN_IF_ERROR(add.Match(computation_->root_instruction(), -// &tagged_instructions)); -// -// Instructions that are "tagged" with a context-specific string will -// be returned in 'tagged_instructions' for further processing (i.e. parsing -// constants or recording the tuple_index). -// -class ExprTree { - public: - explicit ExprTree(HloOpcode opcode) : opcode_(opcode) {} - ExprTree(HloOpcode opcode, const string& tag) : opcode_(opcode), tag_(tag) {} - ExprTree(HloOpcode opcode, const ExprTree& operand0) : opcode_(opcode) { - SetOperand(0, operand0); - } - ExprTree(HloOpcode opcode, int64 index0, const ExprTree& operand0) - : opcode_(opcode) { - SetOperand(index0, operand0); - } - ExprTree(HloOpcode opcode, int64 index0, const ExprTree& operand0, - int64 index1, const ExprTree& operand1) - : opcode_(opcode) { - SetOperand(index0, operand0); - SetOperand(index1, operand1); - } - ExprTree(HloOpcode opcode, const string& tag, const ExprTree& operand0) - : opcode_(opcode), tag_(tag) { - SetOperand(0, operand0); - } - ExprTree(HloOpcode opcode, const ExprTree& operand0, const ExprTree& operand1) - : opcode_(opcode) { - SetOperand(0, operand0); - SetOperand(1, operand1); - } - - ExprTree(const ExprTree& to_copy) { - opcode_ = to_copy.opcode_; - tag_ = to_copy.tag_; - if (to_copy.fused_root_tree_ != nullptr) { - fused_root_tree_.reset(new ExprTree(*to_copy.fused_root_tree_)); - } - for (auto& pair : to_copy.operands_) { - CHECK(operands_.find(pair.first) == operands_.end()); - operands_.insert(std::make_pair( - pair.first, std::unique_ptr(new ExprTree(*pair.second)))); - } - } - - void SetFusedRoot(const ExprTree& fused_root) { - fused_root_tree_.reset(new ExprTree(fused_root)); - } - - typedef std::unordered_map - TaggedInstructionMap; - - // Matches 'instruction' HloOpcode against 'opcode_'. - // Recursively matches each operand in 'operands_'. - // Recursively matches fused instructions starting at 'fused_root_tree_' - // if 'opcode_ == kFusion'. - // Returns OK status, and instructions in 'tagged_instructions' for each - // matched ExprTree node with a non-empty 'tag_'. - // Returns error message on failure. - Status Match(const HloInstruction* instruction, - TaggedInstructionMap* tagged_instructions) const { - if (opcode_ != instruction->opcode()) { - return InvalidArgument("got opcode %s, want %s", - HloOpcodeString(instruction->opcode()).c_str(), - HloOpcodeString(opcode_).c_str()); - } - - VLOG(2) << "Matched " << HloOpcodeString(opcode_) << ": " << tag_; - if (!tag_.empty()) { - tagged_instructions->insert({tag_, instruction}); - } - - if (instruction->opcode() == HloOpcode::kFusion) { - CHECK(fused_root_tree_ != nullptr); - // Match fused instructions for this node starting a 'fused_root_tree'. - TF_RETURN_IF_ERROR(fused_root_tree_->Match( - instruction->fused_expression_root(), tagged_instructions)); - } - - // Match each operand in 'operands_'. - for (auto& pair : operands_) { - TF_RETURN_IF_ERROR(pair.second->Match(instruction->operand(pair.first), - tagged_instructions)); - } - return Status::OK(); - } - - private: - void SetOperand(int64 index, const ExprTree& operand) { - CHECK_EQ(0, operands_.count(index)); - operands_.insert(std::make_pair(index, MakeUnique(operand))); - } - - HloOpcode opcode_; - std::unordered_map> operands_; - std::unique_ptr fused_root_tree_; - string tag_; -}; - -// MatcherBase is a base class that provides common functionality for -// sub-classes which match specific target sub-computations (i.e. loop -// induction variable initialization, comparison and update). -class MatcherBase { - public: - MatcherBase() {} - virtual ~MatcherBase() {} - - // Attempts to match each ExprTree in 'expr_trees_'. - // Returns OK on the first successful match, error status otherwise. - virtual Status Run() { - Status status; - for (const ExprTree& expr_tree : expr_trees_) { - status = MatchExprTree(expr_tree); - if (status.ok()) { - return status; - } - } - return status; - } - - virtual Status MatchExprTree(const ExprTree& expr_tree) = 0; - - // Returns the constant value parsed form kConstant 'instruction'. - // Returns error status otherwise. - Status ParseConstInteger(const HloInstruction* instruction, - int64* const_value) const { - CHECK_EQ(HloOpcode::kConstant, instruction->opcode()); - PrimitiveType element_type = instruction->shape().element_type(); - if (element_type != S32 && element_type != S64) { - return InvalidArgument("Expected constant of integral type."); - } - const Literal& literal = instruction->literal(); - PrimitiveType type = literal.shape().element_type(); - if (type != S32 && type != S64) { - return InvalidArgument("Must use S32 or S64 integral types."); - } - if (type == S32) { - *const_value = static_cast(literal.GetFirstElement()); - } else if (type == S64) { - *const_value = literal.GetFirstElement(); - } - return Status::OK(); - } - - StatusOr GetTaggedInstruction( - const string& tag, - const ExprTree::TaggedInstructionMap& tagged_instructions) { - auto it = tagged_instructions.find(tag); - if (it == tagged_instructions.end()) { - return InvalidArgument("Cound not find instruction for tag: %s", - tag.c_str()); - } - return it->second; - } - - protected: - std::vector expr_trees_; - - private: - TF_DISALLOW_COPY_AND_ASSIGN(MatcherBase); -}; - -// WhileConditionComputationMatcher attempts to match a target computation -// pattern in the while condition sub-computation. -// If the target pattern is matched, two pieces of information are extracted -// from 'tagged' instructions returned by the matcher: -// -// *) 'tuple_index': -// *) The loop induction variable tuple_index from the GetTupleElement -// instruction of the matched computation. -// *) Used in subsequent matching passes of while init operand and body -// computations to select loop induction variable tuple element. -// -// *) 'loop_limit': -// *) The integral value from Constant root operand in matched computation. -// *) Used as the constant for the loop limit. -// -class WhileConditionComputationMatcher : public MatcherBase { - public: - explicit WhileConditionComputationMatcher(const HloComputation* computation) - : computation_(computation) { - expr_trees_.emplace_back(BuildCondExprTree()); - } - - int64 loop_limit() const { return loop_limit_; } - int64 tuple_index() const { return tuple_index_; } - - private: - // Builds expression tree for the following condition computation: - // - // Const Parameter - // \ / - // Fusion ------------> FusionParam FusionParam - // \ / - // GTE / - // \ / - // LessThan (fused root) - // - ExprTree BuildCondExprTree() { - // Build ExprTree for fused instructions. - ExprTree fused_root( - HloOpcode::kLt, - ExprTree(HloOpcode::kGetTupleElement, "gte", - ExprTree(HloOpcode::kParameter, "gte.fusion_param.param0")), - ExprTree(HloOpcode::kParameter)); - - // Build top-level computation. - ExprTree root(HloOpcode::kFusion, - ExprTree(HloOpcode::kConstant, "loop_limit"), - ExprTree(HloOpcode::kParameter, "param0")); - - root.SetFusedRoot(fused_root); - return root; - } - - Status MatchExprTree(const ExprTree& expr_tree) override { - VLOG(2) << "MATCHING while condition"; - ExprTree::TaggedInstructionMap tagged_instructions; - TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(), - &tagged_instructions)); - - // Get tagged GTE instruction and set 'tuple_index_'. - TF_ASSIGN_OR_RETURN(const HloInstruction* gte, - GetTaggedInstruction("gte", tagged_instructions)); - tuple_index_ = gte->tuple_index(); - - // Get tagged Constant instruction and parse 'loop_limit_'. - TF_ASSIGN_OR_RETURN( - const HloInstruction* const_hlo, - GetTaggedInstruction("loop_limit", tagged_instructions)); - TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_limit_)); - - // Get tagged "param0" instruction, and check that it matches - // 'computation_' parameter 0. - TF_ASSIGN_OR_RETURN(const HloInstruction* param0, - GetTaggedInstruction("param0", tagged_instructions)); - if (param0 != computation_->parameter_instruction(0)) { - return InvalidArgument("Unexpected Parameter0 instruction : %s", - param0->name().c_str()); - } - - // Get tagged 'gte.fusion_param.param0', find its associated fusion operand, - // and compare it to 'computation_' parameter0. - TF_ASSIGN_OR_RETURN( - const HloInstruction* gte_fusion_param0, - GetTaggedInstruction("gte.fusion_param.param0", tagged_instructions)); - CHECK_EQ(HloOpcode::kParameter, gte_fusion_param0->opcode()); - CHECK(gte_fusion_param0->IsFused()); - if (gte_fusion_param0->parent()->FusionInstruction()->operand( - gte_fusion_param0->parameter_number()) != - computation_->parameter_instruction(0)) { - return InvalidArgument("Could not match fusion param: %s", - gte_fusion_param0->name().c_str()); - } - - return Status::OK(); - } - - const HloComputation* computation_; - - int64 loop_limit_ = -1; - int64 tuple_index_ = -1; - - TF_DISALLOW_COPY_AND_ASSIGN(WhileConditionComputationMatcher); -}; - -// WhileInitOperandMatcher matches a target computation pattern of the -// while instructions 'init' operand, indexing the tuple at 'tuple_index'. -// On success, parses constant 'loop_start' which represents the loop induction -// variable start values, then returns OK. -// Returns error status otherwise. -class WhileInitOperandMatcher : public MatcherBase { - public: - WhileInitOperandMatcher(const HloInstruction* while_hlo, - const int64 tuple_index) - : while_hlo_(while_hlo), tuple_index_(tuple_index) { - expr_trees_.emplace_back(BuildInitExprTree()); - } - - int64 loop_start() const { return loop_start_; } - - private: - // Builds expression tree for the following while init operand subcomputation: - // - // Const - // | - // Copy - // | - // Tuple0 - // | - // While - // - ExprTree BuildInitExprTree() { - return ExprTree( - HloOpcode::kWhile, "while", - ExprTree(HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kCopy, - ExprTree(HloOpcode::kConstant, "loop_start")))); - } - - Status MatchExprTree(const ExprTree& expr_tree) override { - VLOG(2) << "MATCHING while init"; - ExprTree::TaggedInstructionMap tagged_instructions; - TF_RETURN_IF_ERROR(expr_tree.Match(while_hlo_, &tagged_instructions)); - - // Get tagged while instruction check against 'while_hlo_'. - TF_ASSIGN_OR_RETURN(const HloInstruction* while_hlo, - GetTaggedInstruction("while", tagged_instructions)); - if (while_hlo != while_hlo_) { - return InvalidArgument("Expected While for instruction : %s", - while_hlo->name().c_str()); - } - - // Get tagged Constant instruction and parse 'loop_start_'. - TF_ASSIGN_OR_RETURN( - const HloInstruction* const_hlo, - GetTaggedInstruction("loop_start", tagged_instructions)); - TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_start_)); - - return Status::OK(); - } - - const HloInstruction* while_hlo_; - const int64 tuple_index_; - - int64 loop_start_ = -1; - - TF_DISALLOW_COPY_AND_ASSIGN(WhileInitOperandMatcher); -}; - -// WhileBodyComputationMatcher matches a target computation pattern for -// the loop induction variable update. Matching proceeds from the while body -// computation root[tuple_index] to param[tuple_index], where 'tuple_index' -// If the target pattern is matched, parses a constant which represents the -// loop induction variable increment value, then returns status OK. -// Returns error status otherwise. -class WhileBodyComputationMatcher : public MatcherBase { - public: - WhileBodyComputationMatcher(const HloComputation* computation, - const int64 tuple_index) - : computation_(computation), tuple_index_(tuple_index) { - expr_trees_.emplace_back(BuildBodyExprTree(0, 1)); - expr_trees_.emplace_back(BuildBodyExprTree(1, 0)); - } - - int64 loop_increment() const { return loop_increment_; } - - private: - // Builds expression tree for the following while body computation: - // - // - // FusionParam FusionParam - // \ / - // Const Param \ GTE1 - // \ / \ / - // Fusion -----------> Add - // | - // Copy - // | - // Tuple0 - // - ExprTree BuildBodyExprTree(const int64 const_index, const int64 gte_index) { - // Build ExprTree for fused instructions. - ExprTree gte1 = - ExprTree(HloOpcode::kGetTupleElement, "gte", - ExprTree(HloOpcode::kParameter, "gte.fusion_param.param0")); - ExprTree fused_root(HloOpcode::kAdd, const_index, - ExprTree(HloOpcode::kParameter), gte_index, gte1); - - // Build fusion instruction (and set fused root). - ExprTree fusion(HloOpcode::kFusion, 0, - ExprTree(HloOpcode::kConstant, "loop_increment"), 1, - ExprTree(HloOpcode::kParameter, "param0")); - fusion.SetFusedRoot(fused_root); - - // Build top-level computation. - ExprTree tuple0(HloOpcode::kTuple, tuple_index_, - ExprTree(HloOpcode::kCopy, fusion)); - return tuple0; - } - - Status MatchExprTree(const ExprTree& expr_tree) override { - VLOG(2) << "MATCHING while body"; - ExprTree::TaggedInstructionMap tagged_instructions; - TF_RETURN_IF_ERROR(expr_tree.Match(computation_->root_instruction(), - &tagged_instructions)); - - for (const auto& pair : tagged_instructions) { - const auto& tag = pair.first; - const auto& inst = pair.second; - - if (tag == "gte" && inst->tuple_index() != tuple_index_) { - // Check that the matched GTE instruction is at the 'tuple_index' we - // matched in the while condition computation. - return InvalidArgument("Unexpected tuple index instruction : %s", - inst->name().c_str()); - } else if (tag == "loop_increment") { - // ParseHloString the constant which represents the loop induction - // variable increment value. - TF_RETURN_IF_ERROR(ParseConstInteger(inst, &loop_increment_)); - } else if (tag == "param0" && - inst != computation_->parameter_instruction(0)) { - // Check that the matched parameter == parameter 0 from 'computation_'. - return InvalidArgument("Unexpected Parameter0 instruction : %s", - inst->name().c_str()); - } else if (tag == "gte.fusion_param.param0") { - // Fusion parameter: lookup and compare with associated fusion operand. - CHECK_EQ(HloOpcode::kParameter, inst->opcode()); - CHECK(inst->IsFused()); - if (inst->parent()->FusionInstruction()->operand( - inst->parameter_number()) != - computation_->parameter_instruction(0)) { - return InvalidArgument("Could not match fusion param: %s", - inst->name().c_str()); - } - } - } - return Status::OK(); - } - - const HloComputation* computation_; - const int64 tuple_index_; - - int64 loop_increment_ = -1; - - TF_DISALLOW_COPY_AND_ASSIGN(WhileBodyComputationMatcher); -}; - -} // namespace - -StatusOr> CanTransformWhileToFor( - const HloInstruction* while_hlo) { - if (while_hlo->opcode() != HloOpcode::kWhile) { - return InvalidArgument("Expected While instruction."); - } - - WhileConditionComputationMatcher cond_matcher(while_hlo->while_condition()); - TF_RETURN_IF_ERROR(cond_matcher.Run()); - - WhileInitOperandMatcher init_matcher(while_hlo, cond_matcher.tuple_index()); - TF_RETURN_IF_ERROR(init_matcher.Run()); - - WhileBodyComputationMatcher body_matcher(while_hlo->while_body(), - cond_matcher.tuple_index()); - TF_RETURN_IF_ERROR(body_matcher.Run()); - - // Check for valid For loop parameters. - if (init_matcher.loop_start() >= cond_matcher.loop_limit()) { - return InvalidArgument("Loop start must be less than loop limit."); - } - if (body_matcher.loop_increment() <= 0) { - return InvalidArgument("Loop increment must greater than zero."); - } - return std::make_tuple(init_matcher.loop_start(), cond_matcher.loop_limit(), - body_matcher.loop_increment()); -} - -} // namespace gpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.h b/tensorflow/compiler/xla/service/gpu/while_transformer.h deleted file mode 100644 index fe3a954e1828ee4a323872eea81f64c7e780ad24..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/gpu/while_transformer.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_ - -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/statusor.h" - -namespace xla { -namespace gpu { - -// Runs an analysis of the while loop instruction 'while_hlo' (and its -// associated sub-computations) to determine if it can be transformed into an -// equivalent "for" loop with the following "for" loop parameters: -// -// *) 'loop_start': loop induction variable starting value. -// *) 'loop_limit': loop induction variable limit value. -// *) 'loop_increment': loop induction variable per-iteration increment value. -// -// Returns an std::tuple = (loop_start, loop_limit, loop_increment) on success. -// The values in the returned tuple are values extracted from the 'while_hlo' -// operand (and its sub-computations) during analysis. -// Returns an error status on failure. -StatusOr> CanTransformWhileToFor( - const HloInstruction* while_hlo); - -} // namespace gpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_WHILE_TRANSFORMER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc index dbc8442ed2785a112b674632689256c01282156b..c5f3906356d821e059d2b1213c9083c4408a4d1c 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc @@ -13,11 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/while_transformer.h" - #include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -110,12 +109,12 @@ class WhileTransformerTest : public HloTestBase { void RunFusionPasses() { // Run standard fusion passes. - EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/false) - .Run(module_.get()) - .ValueOrDie()); - EXPECT_TRUE(gpu::GpuInstructionFusion(/*may_duplicate=*/true) - .Run(module_.get()) - .ValueOrDie()); + TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/false) + .Run(module_.get()) + .status()); + TF_ASSERT_OK(gpu::GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module_.get()) + .status()); } void RunCopyInsertionPass() { @@ -141,10 +140,7 @@ class WhileTransformerTest : public HloTestBase { Shape condition_result_shape_; }; -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) { +TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) { // Build computation with induction variable at tuple element 0. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(0, 10)); @@ -153,18 +149,13 @@ TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - TF_ASSERT_OK(result.status()); - // Check results. - EXPECT_THAT(result.ConsumeValueOrDie(), - Eq(std::tuple(0, 10, 1))); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_TRUE(result); + EXPECT_EQ(10, *result); } -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) { +TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) { // Build computation with induction variable at tuple element 1. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(1, 10)); @@ -173,19 +164,14 @@ TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - TF_ASSERT_OK(result.status()); - // Check results. - EXPECT_THAT(result.ConsumeValueOrDie(), - Eq(std::tuple(0, 10, 1))); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_TRUE(result); + EXPECT_EQ(10, *result); } -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) { - // Build computation with invalid loop limit. +TEST_F(WhileTransformerTest, ImpossibleLoopLimit) { + // Build computation with an impossible loop limit. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(0, 5)); auto body = module_->AddEmbeddedComputation(BuildBodyComputation(0, 1, 1)); @@ -193,17 +179,13 @@ TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - ASSERT_FALSE(result.ok()); - EXPECT_THAT(result.status().error_message(), - HasSubstr("Loop start must be less than loop limit.")); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_TRUE(result); + EXPECT_EQ(0, *result); } -// TODO(b/68830972): The while transformer is far too fragile. It patterns -// matches the exact expressions of opcodes. Re-enable when transformation is -// more general -TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) { +TEST_F(WhileTransformerTest, InvalidLoopIncrement) { // Build computation with invalid loop increment. auto condition = module_->AddEmbeddedComputation(BuildConditionComputation(0, 10)); @@ -212,11 +194,9 @@ TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) { // Run HLO Optimization passes. RunFusionPasses(); RunCopyInsertionPass(); - // Run WhileTransformer. - auto result = gpu::CanTransformWhileToFor(while_hlo); - ASSERT_FALSE(result.ok()); - EXPECT_THAT(result.status().error_message(), - HasSubstr("Loop increment must greater than zero.")); + + auto result = ComputeWhileLoopTripCount(while_hlo); + ASSERT_FALSE(result); } } // namespace diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 63a8a813cddf304e60fa9b4bbf709eca2d7c2cae..be9098f555e78f3cabfe55481356f8b6841a3a2b 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -151,8 +151,11 @@ message HloInstructionProto { // Backend configuration for the instruction. Has backend-specific meaning. string backend_config = 43; - // Cross Replica Sum fields. + // Cross replica op fields. + // TODO(b/112107579): remove replica_group_ids field and always use + // replica_groups. repeated int64 replica_group_ids = 44; + repeated ReplicaGroup replica_groups = 49; int64 all_reduce_id = 45; string cross_replica_sum_barrier = 46; @@ -160,6 +163,8 @@ message HloInstructionProto { // present for Send and Recv instructions and their SendDone and RecvDone // partners. bool is_host_transfer = 47; + + xla.ScatterDimensionNumbers scatter_dimension_numbers = 48; } // Serialization of HloComputation. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 1f672502f72f9c658b681383e858995f6e94d2c7..1bbb0ff08e26f626f4c3992a5f20ec4990f7db2d 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -49,9 +49,9 @@ Status HloCostAnalysis::Preprocess(const HloInstruction* hlo) { // The default number of bytes accessed for an instruction is the sum of the // sizes of the inputs and outputs. The default ShapeUtil::ByteSizeOf does not // handle opaque types. - float bytes_accessed = shape_size_(hlo->shape()); + float bytes_accessed = GetShapeSize(hlo->shape()); for (const HloInstruction* operand : hlo->operands()) { - bytes_accessed += shape_size_(operand->shape()); + bytes_accessed += GetShapeSize(operand->shape()); } current_properties_[kBytesAccessedKey] = bytes_accessed; @@ -121,6 +121,13 @@ Status HloCostAnalysis::HandleElementwiseOp( } } +int64 HloCostAnalysis::GetShapeSize(const Shape& shape) const { + if (!LayoutUtil::HasLayout(shape)) { + return 0; + } + return shape_size_(shape); +} + Status HloCostAnalysis::HandleElementwiseUnary(const HloInstruction* hlo) { return HandleElementwiseOp(hlo); } @@ -181,21 +188,21 @@ Status HloCostAnalysis::HandleReverse(const HloInstruction*) { } Status HloCostAnalysis::HandleSlice(const HloInstruction* slice) { - current_properties_[kBytesAccessedKey] = shape_size_(slice->shape()) * 2; + current_properties_[kBytesAccessedKey] = GetShapeSize(slice->shape()) * 2; return Status::OK(); } Status HloCostAnalysis::HandleDynamicSlice( const HloInstruction* dynamic_slice) { current_properties_[kBytesAccessedKey] = - shape_size_(dynamic_slice->shape()) * 2; + GetShapeSize(dynamic_slice->shape()) * 2; return Status::OK(); } Status HloCostAnalysis::HandleDynamicUpdateSlice( const HloInstruction* dynamic_update_slice) { current_properties_[kBytesAccessedKey] = - shape_size_(dynamic_update_slice->operand(1)->shape()) * 2; + GetShapeSize(dynamic_update_slice->operand(1)->shape()) * 2; return Status::OK(); } @@ -204,7 +211,7 @@ Status HloCostAnalysis::HandleTuple(const HloInstruction* tuple) { // through them). The memory touched is then only the size of the output // index table of the tuple. - current_properties_[kBytesAccessedKey] = shape_size_(tuple->shape()); + current_properties_[kBytesAccessedKey] = GetShapeSize(tuple->shape()); return Status::OK(); } @@ -526,12 +533,25 @@ Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) { // TODO(b/33004697): Compute correct cost here, taking the actual number of // replicas into account. double flops = 0.0; - ShapeUtil::ForEachSubshape( - crs->shape(), [&, this](const Shape& subshape, const ShapeIndex&) { - if (ShapeUtil::IsArray(subshape)) { - flops += ShapeUtil::ElementsIn(subshape); - } - }); + ShapeUtil::ForEachSubshape(crs->shape(), + [&](const Shape& subshape, const ShapeIndex&) { + if (ShapeUtil::IsArray(subshape)) { + flops += ShapeUtil::ElementsIn(subshape); + } + }); + current_properties_[kFlopsKey] = flops; + return Status::OK(); +} + +Status HloCostAnalysis::HandleAllToAll(const HloInstruction* hlo) { + // TODO(b/110096724): Compute correct cost here. + double flops = 0.0; + ShapeUtil::ForEachSubshape(hlo->shape(), + [&](const Shape& subshape, const ShapeIndex&) { + if (ShapeUtil::IsArray(subshape)) { + flops += ShapeUtil::ElementsIn(subshape); + } + }); current_properties_[kFlopsKey] = flops; return Status::OK(); } @@ -546,15 +566,9 @@ Status HloCostAnalysis::HandleRng(const HloInstruction* random) { } Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { - // Compute the properties of the fused expression and attribute them to the - // fusion node. Use a dummy shape_size to avoid any errors from trying to - // calculate the size of a shape that does not have a layout, since nodes - // inside fusion nodes do not necessarily have a layout assigned. - ShapeSizeFunction shape_size = [](const Shape& shape) { return 0; }; TF_ASSIGN_OR_RETURN( current_properties_, - ProcessSubcomputation(fusion->fused_instructions_computation(), - &shape_size)); + ProcessSubcomputation(fusion->fused_instructions_computation())); // Fusion nodes that produce a tuple also produce the entries in the tuple. // Ignore the memory accessed inside fused ops, since fusion is supposed to @@ -563,11 +577,11 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) { ShapeUtil::ForEachSubshape( fusion->shape(), [this](const Shape& subshape, const ShapeIndex& /*shape_index*/) { - current_properties_[kBytesAccessedKey] += shape_size_(subshape); + current_properties_[kBytesAccessedKey] += GetShapeSize(subshape); }); for (const HloInstruction* operand : fusion->operands()) { - current_properties_[kBytesAccessedKey] += shape_size_(operand->shape()); + current_properties_[kBytesAccessedKey] += GetShapeSize(operand->shape()); } return Status::OK(); @@ -648,6 +662,11 @@ Status HloCostAnalysis::HandleGather(const HloInstruction* gather) { return Status::OK(); } +Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) { + // TODO(b/32945756): Compute the properties of the sub-computation. + return Status::OK(); +} + Status HloCostAnalysis::FinishVisit(const HloInstruction*) { return Status::OK(); } @@ -685,11 +704,8 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const { } StatusOr HloCostAnalysis::ProcessSubcomputation( - HloComputation* computation, const ShapeSizeFunction* shape_size) { - if (shape_size == nullptr) { - shape_size = &shape_size_; - } - HloCostAnalysis visitor(*shape_size, per_second_rates_); + HloComputation* computation) { + HloCostAnalysis visitor(shape_size_, per_second_rates_); TF_RETURN_IF_ERROR(computation->Accept(&visitor)); return visitor.properties(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 82d650dc7b2a7fdd7c156d5fadcabd40f5535161..193a04bea0831de2b3aca19b17a445ad73e02e49 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -71,6 +71,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleConvolution(const HloInstruction* convolution) override; Status HandleFft(const HloInstruction* fft) override; Status HandleCrossReplicaSum(const HloInstruction* crs) override; + Status HandleAllToAll(const HloInstruction* hlo) override; Status HandleInfeed(const HloInstruction* infeed) override; Status HandleOutfeed(const HloInstruction* outfeed) override; Status HandleHostCompute(const HloInstruction* host_compute) override; @@ -104,6 +105,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleWhile(const HloInstruction* xla_while) override; Status HandleConditional(const HloInstruction* conditional) override; Status HandleGather(const HloInstruction* gather) override; + Status HandleScatter(const HloInstruction* scatter) override; Status FinishVisit(const HloInstruction* root) override; Status Preprocess(const HloInstruction* hlo) override; @@ -149,11 +151,8 @@ class HloCostAnalysis : public ConstDfsHloVisitor { const Properties& per_second_rates); // Returns the properties computed from visiting the computation rooted at the - // given hlo. Uses shape_size_ to calculate shape sizes if shape_size is null, - // otherwise uses shape_size_. - StatusOr ProcessSubcomputation( - HloComputation* computation, - const ShapeSizeFunction* shape_size = nullptr); + // given hlo. + StatusOr ProcessSubcomputation(HloComputation* computation); // Utility function to handle all element-wise operations. Status HandleElementwiseOp(const HloInstruction* hlo_instruction); @@ -170,6 +169,10 @@ class HloCostAnalysis : public ConstDfsHloVisitor { static float GetPropertyForHlo(const HloInstruction& hlo, const string& key, const HloToProperties& hlo_to_properties); + // Decorates shape_size_ by returning 0 immediately if the shape does not have + // a layout. + int64 GetShapeSize(const Shape& shape) const; + // Function which computes the size of the top-level of a given shape (not // including nested elements, if any). If null then bytes_accessed methods // return an error. diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 2ec31a91488805b323549575f9c8d1a92ea9c619..4755c4a0cf8d268b1c47e596a14605eb2c60b36c 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -2365,7 +2365,7 @@ TEST_F(CanShareOperandBufferWithUserTest, FusionCanShareBufferCustomized) { TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - auto make_cond = [this, &data_shape]() { + auto make_cond = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Cond"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); @@ -2374,7 +2374,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { return builder.Build(); }; - auto make_body = [this, &data_shape]() { + auto make_body = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Body"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc index ffc18a0f886df86d87944d9c284a6faf8afe4c60..70271be304336767bd3fd01297217e9309a941b6 100644 --- a/tensorflow/compiler/xla/service/hlo_domain_test.cc +++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc @@ -490,5 +490,38 @@ TEST_F(HloDomainTest, DumpParseNullSharding) { ASSERT_TRUE(ParseModule(hlo_string).status().ok()); } +TEST_F(HloDomainTest, DomainTuple) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = f32[4] parameter(0), sharding={maximal device=0} + cst = u32[] constant(0), sharding={maximal device=1} + tpl = (u32[], f32[4]) tuple(cst, p0), sharding={{maximal device=1}, {maximal device=0}} + ROOT gte = f32[4] get-tuple-element(tpl), index=1, sharding={maximal device=0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string)); + + HloDomainIsolator isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module)); + EXPECT_TRUE(isolator_changed); + + // Clear sharding of tpl instruction, in order to test domain sharding + // application. + auto tpl = FindInstruction(module, "tpl"); + tpl->clear_sharding(); + + HloDomainRemover remover(ShardingMetadata::KindName(), + ShardingMetadata::NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module)); + EXPECT_TRUE(remover_changed); + + EXPECT_EQ(HloSharding::Tuple(tpl->shape(), {HloSharding::AssignDevice(1), + HloSharding::AssignDevice(0)}), + tpl->sharding()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc index c804f4364f6d16d5b8112219ce884495200aa827..b9244b8e9e5f34e7ac5113c8eacb6f8243eea314 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc @@ -144,6 +144,7 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { opcode == HloOpcode::kCrossReplicaSum || opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap || opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow || + opcode == HloOpcode::kScatter || opcode == HloOpcode::kSelectAndScatter || opcode == HloOpcode::kConditional) { continue; diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index cba72469ce73603f05d9957eb64e8519e8b8aec0..3ac6d68df30955d2e5e06e1e76d2182772151b47 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -2048,6 +2048,459 @@ ENTRY main { *Evaluate({operand.get(), gather_indices.get()}))); } +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) { + const char* hlo_text = R"( +HloModule TensorFlowScatterV1 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{10, 20, 30}, {4, 5, 6}, {70, 80, 90}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) { + const char* hlo_text = R"( +HloModule TensorFlowScatterV2 + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[3,2] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={0}, + inserted_window_dims={1}, + scatter_dims_to_operand_dims={1}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 30}, {40, 60}, {70, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{10, 2, 30}, {40, 5, 60}, {70, 8, 90}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{11, 22, 33}, {4, 5, 6}, {77, 88, 99}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +mul_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT mul = s32[] multiply(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=mul_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{10, 40, 90}, {4, 5, 6}, {490, 640, 810}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_F32) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_f32 (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(f32[] lhs, f32[] rhs) +} + +ENTRY main { + operand = f32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = f32[2,3] parameter(2) + ROOT scatter = f32[3,3] scatter(operand, indices, updates), + to_apply=add_f32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = LiteralUtil::CreateR2( + {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({2, 1}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{0.4, 1.1, 0.7}, {2.3, 3.1, 1.6}}); + EXPECT_TRUE(LiteralTestUtil::Near( + *LiteralUtil::CreateR2( + {{1.1, 2.2, 3.3}, {6.7, 8.6, 8.2}, {8.1, 9.9, 10.6}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}), + ErrorSpec{0.1, 0.01})); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) { + const char* hlo_text = R"( +HloModule TensorFlowScatter + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,3] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({1, 1}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20, 30}, {70, 80, 90}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{1, 2, 3}, {84, 105, 126}, {7, 8, 9}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) { + const char* hlo_text = R"( +HloModule TensorFlowScatterMultipleBatchDims + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,3,2] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={1}, + inserted_window_dims={1}, + scatter_dims_to_operand_dims={1}, + index_vector_dim=2 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 2}, {2, 1}}); + std::unique_ptr updates = LiteralUtil::CreateR3( + {{{10, 30}, {40, 60}, {70, 90}}, {{5, 5}, {5, 5}, {5, 5}}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *LiteralUtil::CreateR2({{11, 7, 38}, {44, 10, 71}, {77, 13, 104}}), + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) { + const char* hlo_text = R"( +HloModule TensorFlowScatterNd + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3,3,2] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{-10, 10}, {-40, 40}}); + std::unique_ptr expected = + LiteralUtil::CreateR3({{{-10, 10}, {-2, 2}, {-3, 3}}, // + {{-40, 40}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, + EvaluateScatter_TensorFlowScatterNd_NonDefaultIndexVectorDim) { + const char* hlo_text = R"( +HloModule TensorFlowScatterNdNonDefaultIndexVectorDim + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3,3,2] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{0, 0}, {1, 0}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{-10, 10}, {-20, 20}}); + std::unique_ptr expected = + LiteralUtil::CreateR3({{{-20, 20}, {-10, 10}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) { + const char* hlo_text = R"( +HloModule DynamicUpdateSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + updates = s32[1,1] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={0,1}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({1, 1}); + std::unique_ptr updates = LiteralUtil::CreateR2({{10}}); + std::unique_ptr expected = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 10, 6}, {7, 8, 9}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) { + const char* hlo_text = R"( +HloModule BatchDynamicUpdateSlice + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + updates = s32[2,1,1] parameter(2) + ROOT scatter = s32[3,3] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1,2}, + inserted_window_dims={}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=0 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR2({{2, 1}, {1, 1}}); + std::unique_ptr updates = + LiteralUtil::CreateR3({{{10}}, {{20}}}); + std::unique_ptr expected = + LiteralUtil::CreateR2({{1, 2, 3}, {4, 20, 6}, {7, 10, 9}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) { + const char* hlo_text = R"( +HloModule TensorFlowScatter_ZeroDimBounds + +update_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + ROOT rhs = s32[] parameter(1) +} + +ENTRY main { + operand = s32[3,0] parameter(0) + indices = s32[2] parameter(1) + updates = s32[2,0] parameter(2) + ROOT scatter = s32[3,0] scatter(operand, indices, updates), + to_apply=update_s32, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 +} +)"; + ParseAndVerifyModule(hlo_text); + std::unique_ptr operand = LiteralUtil::CreateR2({{}, {}, {}}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR1({0, 2}); + std::unique_ptr updates = LiteralUtil::CreateR2({{}, {}}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *operand, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) { + const string hlo_text = R"( +HloModule Scatter_NoUpdateWindowDims + +add_s32 (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(s32[] lhs, s32[] rhs) +} + +ENTRY main { + operand = s32[3] parameter(0) + indices = s32[2,2,1] parameter(1) + updates = s32[2,2] parameter(2) + ROOT scatter = s32[3] scatter(operand, indices, updates), + to_apply=add_s32, + update_window_dims={}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2 +} +)"; + ParseAndVerifyModule(hlo_text); + + std::unique_ptr operand = LiteralUtil::CreateR1({0, 1, 2}); + std::unique_ptr scatter_indices = + LiteralUtil::CreateR3({{{0}, {1}}, {{2}, {1}}}); + std::unique_ptr updates = + LiteralUtil::CreateR2({{10, 20}, {30, 40}}); + std::unique_ptr expected = + LiteralUtil::CreateR1({10, 61, 32}); + EXPECT_TRUE(LiteralTestUtil::Equal( + *expected, + *Evaluate({operand.get(), scatter_indices.get(), updates.get()}))); +} + // Verifies that HloEvaluator evaluates a HLO instruction that performs // element-wise comparison with 2 bfloat16 operands. TEST_P(HloEvaluatorTest, DoesCompareBF16) { diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index d5b4be7e1284509a4494b0e804e5396c7cfcecc2..084b49b4783fe15e91917317d8b3746e2c7569d0 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -1473,6 +1473,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } Status HandleReduce(HloInstruction* reduce) override { + // TODO(b/112040122): Support variadic reduce. + if (!ShapeUtil::IsArray(reduce->shape())) { + return Unimplemented("Variadic reduce is not supported in the Evaluator"); + } auto arg = reduce->operand(0); auto init_value = reduce->operand(1); tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); @@ -1481,8 +1485,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { ShapeUtil::Rank(arg->shape()) - dimensions.size()); TF_ASSIGN_OR_RETURN(auto inferred_return_shape, ShapeInference::InferReduceShape( - /*arg=*/arg->shape(), - /*init_value=*/init_value->shape(), + {&arg->shape(), &init_value->shape()}, /*dimensions_to_reduce=*/dimensions, /*to_apply=*/function->ComputeProgramShape())); TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape)) @@ -1772,6 +1775,388 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + // Reshapes the scatter indices input to have a trailing degenerate `1` + // dimension if necessary. Hands over the ownership of the newly created + // literal (if there is one) to `reshaped_indices`. + StatusOr> ReshapedScatterIndices( + int64 index_vector_dim, const Literal& indices, + std::unique_ptr* reshaped_indices) { + if (indices.shape().dimensions_size() != index_vector_dim) { + return std::cref(indices); + } + + std::vector new_shape(indices.shape().dimensions().begin(), + indices.shape().dimensions().end()); + new_shape.push_back(1); + TF_ASSIGN_OR_RETURN(*reshaped_indices, indices.Reshape(new_shape)); + return std::cref(**reshaped_indices); + } + + // Returns an ShapeUtil::IndexIterationSpace that iterates over the update + // scatter dimensions while keeping the rest of the update dimensions clamped + // to 0. + ShapeUtil::IndexIterationSpace IterationSpaceForUpdateScatterIndices( + const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) { + int64 updates_rank = updates_shape.dimensions_size(); + std::vector index_base(updates_rank, 0); + std::vector index_count(updates_rank, 1); + for (int64 i = 0; i < updates_rank; i++) { + bool is_update_scatter_dim = + !c_binary_search(dim_numbers.update_window_dims(), i); + if (is_update_scatter_dim) { + index_count[i] = updates_shape.dimensions(i); + } + } + return {std::move(index_base), std::move(index_count), + std::vector(updates_rank, 1)}; + } + + // Return an ShapeUtil::IndexIterationSpace that iterates over the update + // window dimensions while keeping the rest of the update dimensions clamped + // to 0. + ShapeUtil::IndexIterationSpace IterationSpaceForUpdateWindowIndices( + const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) { + int64 updates_rank = updates_shape.dimensions_size(); + std::vector index_base(updates_rank, 0); + std::vector index_count(updates_rank, 1); + for (int64 i = 0; i < updates_rank; i++) { + bool is_update_window_dim = + c_binary_search(dim_numbers.update_window_dims(), i); + if (is_update_window_dim) { + index_count[i] = updates_shape.dimensions(i); + } + } + return {std::move(index_base), std::move(index_count), + std::vector(updates_rank, 1)}; + } + + // This functor computes the contribution of scatter_indices to an input index + // corresponding to an update index. That is, given an update index I, it + // picks out the scatter indices in I and uses them to look up a scatter + // index, S, from the scatter indices tensor, and expands S into the input + // space according to scatter_dims_to_operand_dims. + // + // This is similar to the class HloEvaluator::OutputGatherIndexToInputIndex + // that does the corresponding function for Gather. + class UpdateScatterIndexToInputIndex { + public: + // The constructor does some setup work that is amortized across all + // iterations. + explicit UpdateScatterIndexToInputIndex( + const ScatterDimensionNumbers* dim_numbers, const Shape& input_shape, + const Shape& updates_shape, const Literal* scatter_indices) + : dim_numbers_(*dim_numbers), scatter_indices_(*scatter_indices) { + for (int64 i = 0; i < updates_shape.dimensions_size(); i++) { + update_dim_is_scatter_dims_.push_back( + !c_binary_search(dim_numbers_.update_window_dims(), i)); + } + + for (int64 i = 0; i < input_shape.dimensions_size(); i++) { + int64 index_of_input_dim_in_index_vector = + FindIndex(dim_numbers_.scatter_dims_to_operand_dims(), i); + if (index_of_input_dim_in_index_vector == + dim_numbers_.scatter_dims_to_operand_dims_size()) { + input_dim_value_to_index_vector_.push_back(-1); + } else { + input_dim_value_to_index_vector_.push_back( + index_of_input_dim_in_index_vector); + } + } + + index_vector_index_.resize(scatter_indices_.shape().dimensions_size()); + input_index_.resize(input_shape.dimensions_size()); + int64 index_vector_size = + scatter_indices_.shape().dimensions(dim_numbers_.index_vector_dim()); + index_vector_.resize(index_vector_size); + } + + // Returns the contribution of scatter_indices to the input index + // corresponding to update_index. See scatter_inner_loop_body. + // + // This is conceptually a stateless transformation from update_index to the + // scatter input index, but: + // + // - Instead of allocating memory to represent the scatter input index on + // every invocation we reuse the same storage for the result + // (input_index_), mutating it in place. + // - Instead of allocating buffers for temporary values like + // index_vector_index_ and index_vector on every invocation, we reuse the + // same storage for all invocations. + // + // This returns an arrayslice into memory owned by the class. + StatusOr> operator()( + tensorflow::gtl::ArraySlice update_index) { + PropagateUpdateIndexScatterDimsToIndexVectorIndex(update_index); + TF_RETURN_IF_ERROR(FetchIndexVector()); + PropagateIndexVectorToInputIndex(); + return tensorflow::gtl::ArraySlice(input_index_); + } + + private: + // Propagates the scatter index dimensions from the update index into + // index_vector_index_ by mutating index_vector_index_ in place. Does not + // update the dim_numbers.index_vector_dim() dimension -- that's the + // dimension we iterate over in FetchIndexVector. + void PropagateUpdateIndexScatterDimsToIndexVectorIndex( + tensorflow::gtl::ArraySlice update_index) { + int64 index_vector_index_i = 0; + for (int64 i = 0, e = update_index.size(); i < e; i++) { + if (!update_dim_is_scatter_dims_[i]) { + continue; + } + + if (index_vector_index_i == dim_numbers_.index_vector_dim()) { + index_vector_index_i++; + } + + index_vector_index_[index_vector_index_i++] = update_index[i]; + } + } + + // Populates index_vector_ by iterating over scatter_indices_ according to + // index_vector_index_. + Status FetchIndexVector() { + int64 index_vector_dim = dim_numbers_.index_vector_dim(); + for (int64 i = 0, e = index_vector_.size(); i < e; i++) { + index_vector_index_[index_vector_dim] = i; + TF_ASSIGN_OR_RETURN(index_vector_[i], scatter_indices_.GetIntegralAsS64( + index_vector_index_)); + } + return Status::OK(); + } + + // Populates input_index_. + void PropagateIndexVectorToInputIndex() { + for (int64 i = 0, e = input_index_.size(); i < e; i++) { + if (input_dim_value_to_index_vector_[i] != -1) { + input_index_[i] = index_vector_[input_dim_value_to_index_vector_[i]]; + } + + // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i] + // remains 0, as set by the constructor. + } + } + + // input_dim_value_to_index_vector_[i] tells us how to compute dimension i + // of the input index from the index vector. See + // PropagateIndexVectorToInputIndex. + std::vector input_dim_value_to_index_vector_; + + // update_dim_is_scatter_dims_[i] is true iff the update index i is a + // scatter dimension. + std::vector update_dim_is_scatter_dims_; + + // The buffer into which we construct an index into scatter_indices_ to + // fetch the index vector. + std::vector index_vector_index_; + + // The index vector fetched from scatter_indices_. + std::vector index_vector_; + + // The result computed by this functor. operator() returns an ArraySlice + // into this vector. + std::vector input_index_; + + const ScatterDimensionNumbers& dim_numbers_; + const Literal& scatter_indices_; + }; + + // This functor computes the contribution of the window indices in an update + // index to an input index. That is, given an update index I it picks out the + // update window indices in I and expands it into a window index into the + // input shape. + // + // This is similar to the class HloEvaluator::OutputWindowIndexToInputIndex + // that does the corresponding function for Gather. + class UpdateWindowIndexToInputIndex { + public: + // The constructor does some setup work that is amortized across all + // iterations. + explicit UpdateWindowIndexToInputIndex( + const ScatterDimensionNumbers& dim_numbers, const Shape& input_shape, + const Shape& updates_shape) { + std::vector window_index_to_update_index; + int64 update_index_count = 0; + for (int64 i = 0; i < updates_shape.dimensions_size(); i++) { + if (c_binary_search(dim_numbers.update_window_dims(), i)) { + window_index_to_update_index.push_back(update_index_count++); + } else { + update_index_count++; + } + } + + int64 window_dim_count = 0; + for (int64 i = 0; i < input_shape.dimensions_size(); i++) { + if (c_binary_search(dim_numbers.inserted_window_dims(), i)) { + input_dim_value_to_update_index_.push_back(-1); + } else { + input_dim_value_to_update_index_.push_back( + window_index_to_update_index[window_dim_count++]); + } + } + + input_index_.resize(input_shape.dimensions_size()); + } + + // Returns the contribution of the window indices to the input index + // corresponding to update_index. See scatter_inner_loop_body. + // + // This is conceptually a stateless transformation from update_index to the + // window input index, but instead of allocating memory to represent the + // scatter input index on every invocation we reuse the same storage for the + // result (input_index_), mutating it in place. + // + // This returns an arrayslice into memory owned by the class. + StatusOr> operator()( + tensorflow::gtl::ArraySlice update_index) { + PropagateUpdateIndexWindowDimsToInputIndex(update_index); + return tensorflow::gtl::ArraySlice(input_index_); + } + + // Returns for a given 'input_dim' the corresponding update dimension index, + // or -1 if 'input_dim' is an elided window dimension. + int64 input_dim_value_to_update_index(int64 input_dim) { + return input_dim_value_to_update_index_[input_dim]; + } + + private: + // Propagates window dimensions from the update index to input_index_ by + // mutating input_index_ in place. + void PropagateUpdateIndexWindowDimsToInputIndex( + tensorflow::gtl::ArraySlice update_index) { + for (int64 i = 0, e = input_index_.size(); i < e; i++) { + if (input_dim_value_to_update_index_[i] != -1) { + input_index_[i] = update_index[input_dim_value_to_update_index_[i]]; + } + + // If input_dim_value_to_index_vector_[i] == -1 then input_index_[i] + // remains 0, as set by the constructor. + } + } + + // input_dim_value_to_index_vector_[i] tells us how to compute dimension i + // of the input index from the update index. See + // PropagateUpdateIndexWindowDimsToInputIndex. + std::vector input_dim_value_to_update_index_; + + // The result computed by this functor. operator() returns an ArraySlice + // into this vector. + std::vector input_index_; + }; + + Status HandleScatter(HloInstruction* scatter) override { + const ScatterDimensionNumbers& dim_numbers = + scatter->scatter_dimension_numbers(); + const Literal& operand = + parent_->GetEvaluatedLiteralFor(scatter->operand(0)); + std::unique_ptr reshaped_scatter_indices; + TF_ASSIGN_OR_RETURN(const Literal& scatter_indices, + ReshapedScatterIndices(dim_numbers.index_vector_dim(), + parent_->GetEvaluatedLiteralFor( + scatter->operand(1)), + &reshaped_scatter_indices)); + const Literal& updates = + parent_->GetEvaluatedLiteralFor(scatter->operand(2)); + const Shape& updates_shape = updates.shape(); + const Shape& operand_shape = operand.shape(); + + ShapeUtil::IndexIterationSpace scatter_indices_iteration_space = + IterationSpaceForUpdateScatterIndices(updates_shape, dim_numbers); + ShapeUtil::IndexIterationSpace window_indices_iteration_space = + IterationSpaceForUpdateWindowIndices(updates_shape, dim_numbers); + + std::vector input_index(operand_shape.dimensions_size()); + std::vector update_index(updates_shape.dimensions_size()); + std::vector input_scatter_index_clamped( + operand_shape.dimensions_size()); + + UpdateScatterIndexToInputIndex update_scatter_index_to_input_index( + &scatter->scatter_dimension_numbers(), /*input_shape=*/operand_shape, + updates_shape, &scatter_indices); + UpdateWindowIndexToInputIndex update_window_index_to_input_index( + scatter->scatter_dimension_numbers(), /*input_shape=*/operand_shape, + updates_shape); + + // Initialize the result with the operand. This makes it easier to handle + // the updates even when the indices are repeated. + std::unique_ptr result = operand.CloneToUnique(); + HloEvaluator embedded_evaluator; + auto scatter_inner_loop_body = + [&](tensorflow::gtl::ArraySlice update_window_index, + tensorflow::gtl::ArraySlice input_scatter_index, + tensorflow::gtl::ArraySlice update_scatter_index) + -> StatusOr { + TF_ASSIGN_OR_RETURN( + tensorflow::gtl::ArraySlice input_window_index, + update_window_index_to_input_index(update_window_index)); + for (int i = 0, e = update_index.size(); i < e; i++) { + update_index[i] = update_scatter_index[i] + update_window_index[i]; + DCHECK_LT(update_index[i], updates_shape.dimensions(i)); + } + for (int i = 0, e = input_scatter_index.size(); i < e; i++) { + int64 update_dim = + update_window_index_to_input_index.input_dim_value_to_update_index( + i); + // If 'update_dim' is -1, it means 'i' is an elided window dim. This + // means we set the iteration index to 0, so for the purpose of the + // following calculations we can consider the update dimension size to + // be 1. + int64 update_dim_size = + update_dim == -1 ? 1 : updates_shape.dimensions(update_dim); + // Clamp the scatter index so that the scatter region fits in the + // operand. input_scatter_index_clamped[i] = + // clamp(input_scatter_index[i], 0, + // operand_shape.dimensions(i) - + // update_dim_size); + input_scatter_index_clamped[i] = + std::min(operand_shape.dimensions(i) - update_dim_size, + std::max(0LL, input_scatter_index[i])); + } + for (int i = 0, e = input_index.size(); i < e; i++) { + input_index[i] = input_scatter_index_clamped[i] + input_window_index[i]; + DCHECK_GE(input_index[i], 0); + DCHECK_LT(input_index[i], operand_shape.dimensions(i)); + } + + auto result_value_literal = + LiteralUtil::CreateR0(result->Get(input_index)); + auto update_value_literal = + LiteralUtil::CreateR0(updates.Get(update_index)); + std::unique_ptr updated_result = + embedded_evaluator + .Evaluate( + *scatter->to_apply(), + {result_value_literal.get(), update_value_literal.get()}) + .ConsumeValueOrDie(); + // Clear visit states so that the we can use the evaluate again on the + // same computation. + embedded_evaluator.ResetVisitStates(); + result->Set(input_index, updated_result->Get({})); + return true; + }; + + auto scatter_outer_loop_body = + [&](tensorflow::gtl::ArraySlice update_scatter_index) + -> StatusOr { + TF_ASSIGN_OR_RETURN( + tensorflow::gtl::ArraySlice input_scatter_index, + update_scatter_index_to_input_index(update_scatter_index)); + TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus( + updates_shape, window_indices_iteration_space, + [&](tensorflow::gtl::ArraySlice update_window_index) { + return scatter_inner_loop_body( + update_window_index, input_scatter_index, update_scatter_index); + })); + return true; + }; + + TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus( + updates_shape, scatter_indices_iteration_space, + scatter_outer_loop_body)); + parent_->evaluated_[scatter] = std::move(result); + return Status::OK(); + } + Status HandleSlice(HloInstruction* slice) override { auto operand = slice->operand(0); const Shape& shape = slice->shape(); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index fd5085bed234068a1bdf18977b38d92badc02a49..1efa6eb5bda7e1cb90874e0466aafd2c788a3fbf 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -844,7 +844,10 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( *elem_count *= dim; } } - if (elem_count.has_value() && *elem_count <= 8) { + // Allow HloDotDumper to print HloInstruction reconstructed from HloProto + // collected from profiling tools. Those constants may not have a valid + // literal. + if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) { return Printf("%s (%s)", constant->literal().ToString(), ShapeUtil::HumanString(constant->shape())); } @@ -1019,6 +1022,8 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { return kWhite; } return kGreen; + case HloOpcode::kScatter: + // Do not de-emphasize Scatter, since it involves significant work. case HloOpcode::kCopy: // Emphasize copy nodes, which are either physical transposes (and thus // significant), or copies of read-only buffers (and thus dead weight). @@ -1043,6 +1048,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kMap: return kGray; case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kRecv: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 8b9bdd2f46fe8a63b419b45ef2c2a2e025c60c8f..8690f2cdaa9b45d126e91b123c6992cbe2f27e1d 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -320,6 +320,15 @@ StatusOr> HloInstruction::CreateFromProto( /*all_reduce_id=*/all_reduce_id); break; } + case HloOpcode::kAllToAll: { + instruction = CreateAllToAll( + proto.shape(), all_operands(), + /*replica_groups=*/ + std::vector(proto.replica_groups().begin(), + proto.replica_groups().end()), + /*barrier=*/proto.cross_replica_sum_barrier()); + break; + } case HloOpcode::kConvolution: TF_RET_CHECK(proto.operand_ids_size() == 2) << "Convolution instruction should have 2 operands but sees " @@ -404,6 +413,22 @@ StatusOr> HloInstruction::CreateFromProto( *gather_dimension_numbers, gather_window_bounds); break; } + case HloOpcode::kScatter: { + TF_RET_CHECK(proto.operand_ids_size() == 3) + << "Scatter instruction should have 3 operands but sees " + << proto.operand_ids_size(); + TF_RET_CHECK(proto.has_scatter_dimension_numbers()) + << "Scatter instruction should have ScatterDimensionNumbers set."; + TF_RET_CHECK(proto.called_computation_ids_size() == 1) + << "Scatter instruction should have 1 called computation but sees " + << proto.called_computation_ids_size(); + auto scatter_dimension_numbers = MakeUnique( + proto.scatter_dimension_numbers()); + instruction = + CreateScatter(proto.shape(), operands(0), operands(1), operands(2), + computations(0), *scatter_dimension_numbers); + break; + } default: { instruction = WrapUnique(new HloInstruction(opcode, proto.shape())); for (const int64 operand_id : proto.operand_ids()) { @@ -655,6 +680,14 @@ HloInstruction::CreateCrossReplicaSum( all_reduce_id); } +/* static */ std::unique_ptr HloInstruction::CreateAllToAll( + const Shape& shape, tensorflow::gtl::ArraySlice operands, + const std::vector& replica_groups, + tensorflow::StringPiece barrier) { + return MakeUnique(shape, operands, replica_groups, + barrier); +} + /* static */ std::unique_ptr HloInstruction::CreateInfeed( const Shape& infeed_shape, HloInstruction* token_operand, const string& config) { @@ -812,11 +845,25 @@ HloInstruction::CreateBitcastConvert(const Shape& shape, } /* static */ std::unique_ptr HloInstruction::CreateReduce( - const Shape& shape, HloInstruction* arg, HloInstruction* init_value, + const Shape& shape, HloInstruction* operand, HloInstruction* init_value, + tensorflow::gtl::ArraySlice dimensions_to_reduce, + HloComputation* reduce_computation) { + auto instruction = WrapUnique(new HloReduceInstruction( + shape, {operand, init_value}, dimensions_to_reduce, reduce_computation)); + return std::move(instruction); +} + +/* static */ std::unique_ptr HloInstruction::CreateReduce( + const Shape& shape, tensorflow::gtl::ArraySlice operands, + tensorflow::gtl::ArraySlice init_values, tensorflow::gtl::ArraySlice dimensions_to_reduce, HloComputation* reduce_computation) { - return MakeUnique( - shape, arg, init_value, dimensions_to_reduce, reduce_computation); + std::vector all_args; + all_args.reserve(operands.size() * 2); + all_args.insert(all_args.end(), operands.begin(), operands.end()); + all_args.insert(all_args.end(), init_values.begin(), init_values.end()); + return MakeUnique(shape, all_args, dimensions_to_reduce, + reduce_computation); } /* static */ std::unique_ptr HloInstruction::CreateReduceWindow( @@ -1062,6 +1109,16 @@ bool HloInstruction::HasSideEffect() const { gather_dim_numbers, window_bounds); } +/* static */ std::unique_ptr HloInstruction::CreateScatter( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers) { + return MakeUnique(shape, operand, scatter_indices, + updates, update_computation, + scatter_dim_numbers); +} + /* static */ std::unique_ptr HloInstruction::CreateDomain( const Shape& shape, HloInstruction* operand, std::unique_ptr operand_side_metadata, @@ -1113,6 +1170,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kGetTupleElement: case HloOpcode::kReducePrecision: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kConvolution: @@ -1124,6 +1182,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kDynamicSlice: case HloOpcode::kSort: case HloOpcode::kGather: + case HloOpcode::kScatter: case HloOpcode::kIota: clone = CloneWithNewOperandsImpl(shape, new_operands, context); break; @@ -1579,6 +1638,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: case HloOpcode::kConvolution: case HloOpcode::kCustomCall: case HloOpcode::kReduceWindow: @@ -1587,6 +1647,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kPad: case HloOpcode::kDynamicSlice: case HloOpcode::kGather: + case HloOpcode::kScatter: LOG(FATAL) << "Base class impl called for opcode with subclass: " << opcode(); } @@ -1693,6 +1754,7 @@ HloComputation* HloInstruction::to_apply() const { case HloOpcode::kReduceWindow: case HloOpcode::kReduce: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kScatter: CHECK_EQ(called_computations_.size(), 1); return called_computations_[0]; default: @@ -1711,6 +1773,7 @@ void HloInstruction::set_to_apply(HloComputation* computation) { case HloOpcode::kReduceWindow: case HloOpcode::kReduce: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kScatter: CHECK_EQ(called_computations_.size(), 1); called_computations_[0] = computation; break; @@ -1977,7 +2040,8 @@ std::vector HloInstruction::ExtraAttributesToString( } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap || opcode() == HloOpcode::kReduceWindow || opcode() == HloOpcode::kReduce || - opcode() == HloOpcode::kCrossReplicaSum) { + opcode() == HloOpcode::kCrossReplicaSum || + opcode() == HloOpcode::kScatter) { extra.push_back( StrCat("to_apply=", PrintName(to_apply()->name(), options))); } else if (!called_computations().empty()) { @@ -2013,6 +2077,7 @@ std::vector HloInstruction::ExtraAttributesToString( case HloOpcode::kReduceWindow: case HloOpcode::kReduce: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kScatter: extra.push_back( StrCat("to_apply=\n", to_apply()->ToString(new_options))); break; @@ -2219,6 +2284,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleFft(this); case HloOpcode::kCrossReplicaSum: return visitor->HandleCrossReplicaSum(this); + case HloOpcode::kAllToAll: + return visitor->HandleAllToAll(this); case HloOpcode::kTuple: return visitor->HandleTuple(this); case HloOpcode::kMap: @@ -2311,6 +2378,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleSendDone(this); case HloOpcode::kGather: return visitor->HandleGather(this); + case HloOpcode::kScatter: + return visitor->HandleScatter(this); case HloOpcode::kDomain: return visitor->HandleDomain(this); case HloOpcode::kAfterAll: @@ -3091,12 +3160,23 @@ const std::vector& HloInstruction::replica_group_ids() const { return Cast(this)->replica_group_ids(); } +const std::vector& HloInstruction::replica_groups() const { + return Cast(this)->replica_groups(); +} + string HloInstruction::cross_replica_sum_barrier() const { - return Cast(this)->cross_replica_sum_barrier(); + if (opcode() == HloOpcode::kCrossReplicaSum) { + return Cast(this)->cross_replica_sum_barrier(); + } + return Cast(this)->cross_replica_sum_barrier(); } void HloInstruction::set_cross_replica_sum_barrier(const string& barrier) { - return Cast(this)->set_cross_replica_sum_barrier( + if (opcode() == HloOpcode::kCrossReplicaSum) { + return Cast(this)->set_cross_replica_sum_barrier( + barrier); + } + return Cast(this)->set_cross_replica_sum_barrier( barrier); } @@ -3171,4 +3251,9 @@ tensorflow::gtl::ArraySlice HloInstruction::gather_window_bounds() return Cast(this)->gather_window_bounds(); } +const ScatterDimensionNumbers& HloInstruction::scatter_dimension_numbers() + const { + return Cast(this)->scatter_dimension_numbers(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 70441b879de6a7fa4af24aa489069c75d7d5cbbd..3c575ae6ea8e60f48def4debcd9cfbea63e396b2 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -449,6 +449,26 @@ class HloInstruction { tensorflow::StringPiece barrier, const tensorflow::gtl::optional& all_reduce_id); + // This op handles the communication of an Alltoall operation. On each core, + // the operands are N ops in the same shape, where N is the number of cores + // participating the Alltoall. Then the N operands are scattered to N cores, + // e.g., the ith operand is sent to the ith core. Then each core gathers the + // received data into a tuple. + // + // - `replica_groups`: each ReplicaGroup contains a list of replica id. If + // empty, all replicas belong to one group in the order of 0 - (n-1). Alltoall + // will be applied within subgroups in the specified order. For example, + // replica groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied + // within replica 1, 2, 3, and in the gather phase, the received blocks will + // be concatenated in the order of 1, 2, 3; another Alltoall will be applied + // within replica 4, 5, 0, and the concatenation order is 4, 5, 0. + // + // TODO(b/110096724): This is NOT YET ready to use. + static std::unique_ptr CreateAllToAll( + const Shape& shape, tensorflow::gtl::ArraySlice operands, + const std::vector& replica_groups, + tensorflow::StringPiece barrier); + // Creates a conversion instruction, where operand is the data to convert and // shape is the target shape for the conversion. static std::unique_ptr CreateConvert(const Shape& shape, @@ -541,17 +561,34 @@ class HloInstruction { int64 dimension); // Creates a reduce instruction, where the computation (given by the handle) - // is applied successively to every element in operand. That is, if f is the - // function to apply (which either takes 2 [accumulator, value] or 3 - // [accumulator, index, value] arguments) and init is a reduction operator - // specified initial value (for example, 0 for addition), then this operation - // will compute: - // f(f(init, [index0], value0), [index1], value1), ...) + // is applied successively to every element in operand. For example, let f be + // the function to apply, which takes 2 arguments, an accumulator and the + // current value. Let init be an initial value (which is normally chosen to be + // the identity element for f, e.g. 0 if f is addition). + // Then the reduce HLO will compute: + // f(f(init, value0), value1), ...) static std::unique_ptr CreateReduce( const Shape& shape, HloInstruction* operand, HloInstruction* init_value, tensorflow::gtl::ArraySlice dimensions_to_reduce, HloComputation* reduce_computation); + // A more general, multiple-argument version of the above. + // The function to apply, f, now takes N arguments: + // [accumulator0, accumulator1, ..., accumulatorN, value0, value1, ..., + // init_valueN], and returns an N-tuple. The performed computation is (for + // commutative and associative f operators) equivalent to: + // + // f_1 = f(init0, ... initN, input0.value0, ..., inputN.value0) + // f_2 = f(f_1.tuple_element(0), ..., f_1.tuple_element(N), input0.value1, + // ..., inputN.value1) + // ... + // TODO(b/112040122): Add support to this in HLO passes and in backends. + static std::unique_ptr CreateReduce( + const Shape& shape, tensorflow::gtl::ArraySlice operands, + tensorflow::gtl::ArraySlice init_values, + tensorflow::gtl::ArraySlice dimensions_to_reduce, + HloComputation* reduce_computation); + // Creates a reduce-window instruction, where the computation (given // by the handle) is applied window-wise at each valid window // position in the operand. @@ -644,6 +681,12 @@ class HloInstruction { const GatherDimensionNumbers& gather_dim_numbers, tensorflow::gtl::ArraySlice window_bounds); + static std::unique_ptr CreateScatter( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers); + // Creates a kDomain instruction which delimits an HLO domain which have // the provided user and operand side metadata. static std::unique_ptr CreateDomain( @@ -1014,9 +1057,7 @@ class HloInstruction { if (sharding_ == nullptr) { return tensorflow::gtl::optional(); } - auto device = sharding_->UniqueDevice(); - return device.ok() ? device.ValueOrDie() - : tensorflow::gtl::optional(); + return sharding_->UniqueDevice(); } // Sets the sharding of this operator. Should only be called by HloModule or // HloComputation methods. @@ -1393,6 +1434,9 @@ class HloInstruction { // Delegates to HloAllReduceInstruction::replica_group_ids. const std::vector& replica_group_ids() const; + // Delegates to HloAllToAllInstruction::replica_groups. + const std::vector& replica_groups() const; + // Delegates to HloAllReduceInstruction::cross_replica_sum_barrier. string cross_replica_sum_barrier() const; void set_cross_replica_sum_barrier(const string& barrier); @@ -1454,6 +1498,9 @@ class HloInstruction { // Delegates to HloGatherInstruction::gather_window_bounds. tensorflow::gtl::ArraySlice gather_window_bounds() const; + // Delegates to HloScatterInstruction::scatter_dimension_numbers(). + const ScatterDimensionNumbers& scatter_dimension_numbers() const; + // Old methods kept for smooth subclassing transition END. protected: diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index b75a2bd34bc5d3b5b6100515748df787b9e7f08a..8a694dde8066ab9a1138b9f7981153d451ddb89e 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -1425,6 +1425,55 @@ TEST_F(HloInstructionTest, StringifyGather_1) { "index_vector_dim=2, window_bounds={30,29,28,27,26}"); } +TEST_F(HloInstructionTest, StringifyScatter) { + Shape input_tensor_shape = ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46}); + Shape scatter_indices_tensor_shape = + ShapeUtil::MakeShape(S64, {10, 9, 5, 7, 6}); + Shape scatter_updates_shape = + ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}); + + HloComputation::Builder builder("Scatter"); + HloInstruction* input = builder.AddInstruction( + HloInstruction::CreateParameter(0, input_tensor_shape, "input_tensor")); + HloInstruction* scatter_indices = + builder.AddInstruction(HloInstruction::CreateParameter( + 1, scatter_indices_tensor_shape, "scatter_indices")); + HloInstruction* scatter_updates = + builder.AddInstruction(HloInstruction::CreateParameter( + 2, scatter_updates_shape, "scatter_updates")); + + HloComputation::Builder update_builder("Scatter.update"); + update_builder.AddInstruction( + HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p1")); + update_builder.AddInstruction( + HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p2")); + + auto module = CreateNewModule(); + auto* update_computation = + module->AddEmbeddedComputation(update_builder.Build()); + + HloInstruction* scatter_instruction = + builder.AddInstruction(HloInstruction::CreateScatter( + input_tensor_shape, input, scatter_indices, scatter_updates, + update_computation, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/2))); + module->AddEntryComputation(builder.Build()); + + EXPECT_EQ( + scatter_instruction->ToString(), + "%scatter = f32[50,49,48,47,46]{4,3,2,1,0} " + "scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, " + "s64[10,9,5,7,6]{4,3,2,1,0} %scatter_indices, " + "f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %scatter_updates), " + "update_window_dims={4,5,6,7,8}, inserted_window_dims={}, " + "scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=2, " + "to_apply=%Scatter.update"); +} + TEST_F(HloInstructionTest, CanonnicalStringificationFusion) { // Tests stringification of a simple op, fusion, while, and conditional. const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index df26a2c744fbcac814727139e1cf7f23037dcc50..1de5032670ff47cda5599cf736bbd3529cfcaba9 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -359,6 +359,67 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl( cross_replica_sum_barrier(), all_reduce_id()); } +HloAllToAllInstruction::HloAllToAllInstruction( + const Shape& shape, tensorflow::gtl::ArraySlice operands, + const std::vector& replica_groups, + tensorflow::StringPiece barrier) + : HloInstruction(HloOpcode::kAllToAll, shape), + replica_groups_(replica_groups), + cross_replica_sum_barrier_(barrier.begin(), barrier.end()) { + for (auto operand : operands) { + AppendOperand(operand); + } +} + +bool HloAllToAllInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return ContainersEqual(replica_groups(), casted_other.replica_groups(), + [](const ReplicaGroup& a, const ReplicaGroup& b) { + return ContainersEqual(a.replica_ids(), + b.replica_ids()); + }) && + cross_replica_sum_barrier() == + casted_other.cross_replica_sum_barrier(); +} + +std::unique_ptr +HloAllToAllInstruction::CloneWithNewOperandsImpl( + const Shape& shape, + tensorflow::gtl::ArraySlice new_operands, + HloCloneContext* /*context*/) const { + return MakeUnique( + shape, new_operands, replica_groups(), cross_replica_sum_barrier()); +} + +std::vector HloAllToAllInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result; + std::vector replica_group_str; + for (const ReplicaGroup& group : replica_groups()) { + replica_group_str.push_back( + StrCat("{", Join(group.replica_ids(), ","), "}")); + } + result.push_back( + StrCat("replica_groups={", Join(replica_group_str, ","), "}")); + + if (!cross_replica_sum_barrier().empty()) { + result.push_back(StrCat("barrier=\"", cross_replica_sum_barrier(), "\"")); + } + + return result; +} + +HloInstructionProto HloAllToAllInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_replica_groups() = {replica_groups_.begin(), + replica_groups_.end()}; + proto.set_cross_replica_sum_barrier(cross_replica_sum_barrier_); + return proto; +} + HloReverseInstruction::HloReverseInstruction( const Shape& shape, HloInstruction* operand, tensorflow::gtl::ArraySlice dimensions) @@ -438,13 +499,14 @@ HloConcatenateInstruction::CloneWithNewOperandsImpl( } HloReduceInstruction::HloReduceInstruction( - const Shape& shape, HloInstruction* arg, HloInstruction* init_value, + const Shape& shape, tensorflow::gtl::ArraySlice args, tensorflow::gtl::ArraySlice dimensions_to_reduce, HloComputation* reduce_computation) : HloInstruction(HloOpcode::kReduce, shape), dimensions_(dimensions_to_reduce.begin(), dimensions_to_reduce.end()) { - AppendOperand(arg); - AppendOperand(init_value); + for (HloInstruction* arg : args) { + AppendOperand(arg); + } AppendComputation(reduce_computation); } @@ -477,8 +539,8 @@ std::unique_ptr HloReduceInstruction::CloneWithNewOperandsImpl( tensorflow::gtl::ArraySlice new_operands, HloCloneContext* context) const { CHECK_EQ(new_operands.size(), 2); - return MakeUnique( - shape, new_operands[0], new_operands[1], dimensions(), to_apply()); + return MakeUnique(shape, new_operands, dimensions(), + to_apply()); } HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension, @@ -2015,4 +2077,91 @@ std::unique_ptr HloGatherInstruction::CloneWithNewOperandsImpl( gather_window_bounds()); } +HloScatterInstruction::HloScatterInstruction( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers) + : HloInstruction(HloOpcode::kScatter, shape) { + AppendOperand(operand); + AppendOperand(scatter_indices); + AppendOperand(updates); + AppendComputation(update_computation); + scatter_dimension_numbers_ = + MakeUnique(scatter_dim_numbers); +} + +string HloScatterInstruction::ScatterDimensionNumbersToString() const { + string update_window_dims = + StrCat("update_window_dims={", + Join(scatter_dimension_numbers().update_window_dims(), ","), "}"); + string inserted_window_dims = StrCat( + "inserted_window_dims={", + Join(scatter_dimension_numbers().inserted_window_dims(), ","), "}"); + string scatter_dims_to_operand_dims = StrCat( + "scatter_dims_to_operand_dims={", + Join(scatter_dimension_numbers().scatter_dims_to_operand_dims(), ","), + "}"); + string index_vector_dim = StrCat( + "index_vector_dim=", scatter_dimension_numbers().index_vector_dim()); + + return Join>( + {update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims, + index_vector_dim}, + ", "); +} + +/* static */ ScatterDimensionNumbers +HloScatterInstruction::MakeScatterDimNumbers( + tensorflow::gtl::ArraySlice update_window_dims, + tensorflow::gtl::ArraySlice inserted_window_dims, + tensorflow::gtl::ArraySlice scatter_dims_to_operand_dims, + int64 index_vector_dim) { + ScatterDimensionNumbers scatter_dim_numbers; + for (int64 update_window_dim : update_window_dims) { + scatter_dim_numbers.add_update_window_dims(update_window_dim); + } + for (int64 inserted_window_dim : inserted_window_dims) { + scatter_dim_numbers.add_inserted_window_dims(inserted_window_dim); + } + for (int64 scatter_dim_to_operand_dim : scatter_dims_to_operand_dims) { + scatter_dim_numbers.add_scatter_dims_to_operand_dims( + scatter_dim_to_operand_dim); + } + scatter_dim_numbers.set_index_vector_dim(index_vector_dim); + return scatter_dim_numbers; +} + +HloInstructionProto HloScatterInstruction::ToProto() const { + HloInstructionProto proto = HloInstruction::ToProto(); + *proto.mutable_scatter_dimension_numbers() = scatter_dimension_numbers(); + return proto; +} + +std::vector HloScatterInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + return {ScatterDimensionNumbersToString()}; +} + +bool HloScatterInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return protobuf_util::ProtobufEquals( + scatter_dimension_numbers(), + casted_other.scatter_dimension_numbers()) && + eq_computations(to_apply(), casted_other.to_apply()); +} + +std::unique_ptr HloScatterInstruction::CloneWithNewOperandsImpl( + const Shape& shape, + tensorflow::gtl::ArraySlice new_operands, + HloCloneContext* context) const { + CHECK_EQ(new_operands.size(), 3); + return MakeUnique( + shape, new_operands[0], new_operands[1], new_operands[2], to_apply(), + scatter_dimension_numbers()); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 132e767420ce1e9229eb8c545a0f3111ffba48f4..9586ad667345111d05015e035c93fe6578e3b665 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -273,6 +273,47 @@ class HloAllReduceInstruction : public HloInstruction { tensorflow::gtl::optional all_reduce_id_; }; +class HloAllToAllInstruction : public HloInstruction { + public: + explicit HloAllToAllInstruction( + const Shape& shape, tensorflow::gtl::ArraySlice operand, + const std::vector& replica_groups, + tensorflow::StringPiece barrier); + + const std::vector& replica_groups() const { + return replica_groups_; + } + + // TODO(b/110096724): rename this. + void set_cross_replica_sum_barrier(string barrier) { + cross_replica_sum_barrier_ = barrier; + } + string cross_replica_sum_barrier() const { + return cross_replica_sum_barrier_; + } + + HloInstructionProto ToProto() const override; + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, + tensorflow::gtl::ArraySlice new_operands, + HloCloneContext* context) const override; + + std::vector replica_groups_; + + // The string representation of the barrier config. + string cross_replica_sum_barrier_; +}; + class HloReverseInstruction : public HloInstruction { public: explicit HloReverseInstruction(const Shape& shape, HloInstruction* operand, @@ -331,7 +372,7 @@ class HloConcatenateInstruction : public HloInstruction { class HloReduceInstruction : public HloInstruction { public: explicit HloReduceInstruction( - const Shape& shape, HloInstruction* arg, HloInstruction* init_value, + const Shape& shape, tensorflow::gtl::ArraySlice args, tensorflow::gtl::ArraySlice dimensions_to_reduce, HloComputation* reduce_computation); // Returns the dimension sizes or numbers associated with this instruction. @@ -340,6 +381,18 @@ class HloReduceInstruction : public HloInstruction { // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; + // Returns the input tensors to be reduced. + tensorflow::gtl::ArraySlice inputs() const { + return tensorflow::gtl::ArraySlice(operands(), 0, + operand_count() / 2); + } + + // Returns the init values of the reduction. + tensorflow::gtl::ArraySlice init_values() const { + return tensorflow::gtl::ArraySlice( + operands(), operand_count() / 2, operand_count()); + } + private: std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; @@ -534,6 +587,8 @@ class HloConstantInstruction : public HloInstruction { explicit HloConstantInstruction(const Shape& shape); // Returns the literal associated with this instruction. const Literal& literal() const { return *literal_; } + // Returns whether there is literal associated with this instruction. + bool HasLiteral() const { return literal_ != nullptr; } // Returns a serialized representation of this instruction. HloInstructionProto ToProto() const override; @@ -1198,6 +1253,45 @@ class HloGatherInstruction : public HloInstruction { std::vector gather_window_bounds_; }; +class HloScatterInstruction : public HloInstruction { + public: + explicit HloScatterInstruction( + const Shape& shape, HloInstruction* operand, + HloInstruction* scatter_indices, HloInstruction* updates, + HloComputation* update_computation, + const ScatterDimensionNumbers& scatter_dim_numbers); + const ScatterDimensionNumbers& scatter_dimension_numbers() const { + CHECK(scatter_dimension_numbers_ != nullptr); + return *scatter_dimension_numbers_; + } + // Returns the dump string of the scatter dimension numbers. + string ScatterDimensionNumbersToString() const; + // Returns a serialized representation of this instruction. + HloInstructionProto ToProto() const override; + + // Creates an instance of ScatterDimensionNumbers. + static ScatterDimensionNumbers MakeScatterDimNumbers( + tensorflow::gtl::ArraySlice update_window_dims, + tensorflow::gtl::ArraySlice inserted_window_dims, + tensorflow::gtl::ArraySlice scatter_dims_to_operand_dims, + int64 index_vector_dim); + + private: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, + tensorflow::gtl::ArraySlice new_operands, + HloCloneContext* context) const override; + + std::unique_ptr scatter_dimension_numbers_; +}; + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_ diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index f0d9fdbc8f86da0bb9d7f9235239df677c9506bc..8e0d38b6a63917582b8bfa10f205e1ed511efef3 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -143,8 +143,47 @@ TokKind HloLexer::LexToken() { return TokKind::kLparen; case ')': return TokKind::kRparen; - case '/': - return LexComment(); + case '/': { + if (PeekCurrentChar() == '*') { + // This is the start of a /*...*/ delimited comment. Save the current + // location in case the comment is unterminated so the error message + // will point to the beginning of the comment. + const char* comment_start = current_ptr_; + current_ptr_++; + // Advance until '*/' is found. + while (true) { + int current = GetNextChar(); + if (current == '*' && PeekCurrentChar() == '/') { + // End of comment. + current_ptr_++; + break; + } + if (current == kEOF) { + // Unterminated comment. + current_ptr_ = comment_start; + return TokKind::kError; + } + } + // Return no token for the comment. Keep lexing. + continue; + } else if (PeekCurrentChar() == '/') { + // This is the start of a '//' delimited comment. Throw away + // everything until end of line or file. The end-of-line character(s) + // are left unlexed in the buffer which is harmless because these are + // skipped later by the lexer. This approach enables support for + // different end-of-line encodings. + while (true) { + int current = PeekCurrentChar(); + if (current == kEOF || current == '\n' || current == '\r') { + break; + } + current_ptr_++; + } + continue; + } + // A lone '/' is an error. + return TokKind::kError; + } case '"': return LexString(); } @@ -299,9 +338,12 @@ TokKind HloLexer::LexNumberOrPattern() { static LazyRE2 int_pattern = {R"([-]?\d+)"}; if (RE2::Consume(&consumable, *int_pattern)) { current_ptr_ = consumable.begin(); - tensorflow::strings::safe_strto64( - StringPieceFromPointers(token_start_, current_ptr_), &int64_val_); - return TokKind::kInt; + auto slice = StringPieceFromPointers(token_start_, current_ptr_); + if (tensorflow::strings::safe_strto64(slice, &int64_val_)) { + return TokKind::kInt; + } + LOG(ERROR) << "Failed to parse int literal: " << slice; + return TokKind::kError; } static LazyRE2 neg_inf = {"-inf"}; @@ -354,16 +396,6 @@ tensorflow::StringPiece HloLexer::GetLine(LocTy loc) const { return StringPieceFromPointers(start, end); } -TokKind HloLexer::LexComment() { - auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end()); - static LazyRE2 comment_pattern = {R"(\/\*.*?\*\/)"}; - if (RE2::Consume(&consumable, *comment_pattern)) { - current_ptr_ = consumable.begin(); - return TokKind::kComment; - } - return TokKind::kError; -} - // Lexes quoted string with escaping characters. If matched, the quoted string // will be unescaped and stored to str_val_. TokKind HloLexer::LexString() { @@ -409,8 +441,6 @@ string TokKindToString(TokKind kind) { return "kRparen"; case TokKind::kArrow: return "kArrow"; - case TokKind::kComment: - return "kComment"; case TokKind::kw_HloModule: return "kw_HloModule"; case TokKind::kw_ENTRY: diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h index ceb674f25e94ac3ac2e6a4a0687a93ffdcd065e0..003ac34ace5713446afa74eb3af96ae33087223e 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -105,7 +105,6 @@ class HloLexer { TokKind LexShape(); TokKind LexConstant(); TokKind LexNumberOrPattern(); - TokKind LexComment(); TokKind LexString(); const tensorflow::StringPiece buf_; diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index b57c940238f0672692e3b65827f43e2f5499502d..c577b4359aae6c66f29860a0e56c3487b07afc02 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -231,6 +231,7 @@ HLO_MATCHER(Tanh); HLO_MATCHER(Trace); HLO_MATCHER(Transpose); HLO_MATCHER(Tuple); +HLO_MATCHER(TupleSelect); HLO_MATCHER(While); // The special cases below let you check additional information about the diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index 7de59acc1efbc0150b95ebdd85a13ede48eec2f9..7961aece541faeb66875885b380158756c503250 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -157,9 +157,8 @@ TEST(HloMatchersTest, ShardingMatcher) { Array assignment({2}); assignment.SetValues({0, 1}); auto sharding = HloSharding::Tuple( - tuple_shape, - {HloSharding::Tile(ShapeUtil::MakeShape(F32, {5}), assignment), - HloSharding::AssignDevice(1), HloSharding::Replicate()}); + tuple_shape, {HloSharding::Tile(assignment), HloSharding::AssignDevice(1), + HloSharding::Replicate()}); p2->set_sharding(sharding); EXPECT_THAT(p0.get(), op::NoSharding()); @@ -172,8 +171,7 @@ TEST(HloMatchersTest, ShardingMatcher) { EXPECT_THAT( p2.get(), - op::Sharding( - "{{f32[5] devices=[2]0,1}, {maximal device=1}, {replicated}}")); + op::Sharding("{{devices=[2]0,1}, {maximal device=1}, {replicated}}")); EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))), "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: " diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 59e9a5a94aa4fc6270bde76c19dbd0d4506a563c..ec279867e595b66a22882703cc06046e3e916c96 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -47,6 +47,7 @@ namespace xla { #define HLO_OPCODE_LIST(V) \ V(kAbs, "abs") \ V(kAdd, "add") \ + V(kAllToAll, "all-to-all") \ V(kAtan2, "atan2") \ V(kBatchNormGrad, "batch-norm-grad") \ V(kBatchNormInference, "batch-norm-inference") \ @@ -118,6 +119,7 @@ namespace xla { V(kReverse, "reverse") \ V(kRng, "rng") \ V(kRoundNearestAfz, "round-nearest-afz") \ + V(kScatter, "scatter") \ V(kSelect, "select") \ V(kSelectAndScatter, "select-and-scatter") \ V(kSend, "send") \ diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index d71d3c81702fb3d2adae82b1055464e4983eb891..4b3cd99dc06520bfeb60430d9d4316db66ea04b3 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -125,6 +125,7 @@ class HloParser { kFloat, kString, kBracedInt64List, + kBracedInt64ListList, kHloComputation, kFftType, kWindow, @@ -205,6 +206,10 @@ class HloParser { bool ParseInt64List(const TokKind start, const TokKind end, const TokKind delim, std::vector* result); + // 'parse_and_add_item' is an lambda to parse an element in the list and add + // the parsed element to the result. It's supposed to capture the result. + bool ParseList(const TokKind start, const TokKind end, const TokKind delim, + const std::function& parse_and_add_item); bool ParseParamListToShape(Shape* shape, LocTy* shape_loc); bool ParseParamList(); @@ -619,6 +624,28 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, } break; } + case HloOpcode::kAllToAll: { + optional>> tmp_groups; + optional barrier; + attrs["replica_groups"] = {/*required=*/false, + AttrTy::kBracedInt64ListList, &tmp_groups}; + attrs["barrier"] = {/*required=*/false, AttrTy::kString, &barrier}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + std::vector replica_groups; + if (tmp_groups) { + c_transform(*tmp_groups, std::back_inserter(replica_groups), + [](const std::vector& ids) { + ReplicaGroup group; + *group.mutable_replica_ids() = {ids.begin(), ids.end()}; + return group; + }); + } + instruction = builder->AddInstruction(HloInstruction::CreateAllToAll( + shape, operands, replica_groups, barrier ? *barrier : "")); + break; + } case HloOpcode::kReshape: { if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -865,18 +892,28 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kReduce: { + auto loc = lexer_.GetLoc(); + optional reduce_computation; attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, &reduce_computation}; optional> dimensions_to_reduce; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, &dimensions_to_reduce}; - if (!ParseOperands(&operands, /*expected_size=*/2) || - !ParseAttributes(attrs)) { + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { return false; } + if (operands.size() % 2) { + return Error(loc, StrCat("expects an even number of operands, but has ", + operands.size(), " operands")); + } instruction = builder->AddInstruction(HloInstruction::CreateReduce( - shape, /*operand=*/operands[0], /*init_value=*/operands[1], + shape, /*operands=*/ + tensorflow::gtl::ArraySlice(operands, 0, + operands.size() / 2), + /*init_values=*/ + tensorflow::gtl::ArraySlice( + operands, operands.size() / 2, operands.size()), *dimensions_to_reduce, *reduce_computation)); break; } @@ -1242,6 +1279,42 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, dim_numbers, *window_bounds)); break; } + case HloOpcode::kScatter: { + optional> update_window_dims; + attrs["update_window_dims"] = { + /*required=*/true, AttrTy::kBracedInt64List, &update_window_dims}; + optional> inserted_window_dims; + attrs["inserted_window_dims"] = { + /*required=*/true, AttrTy::kBracedInt64List, &inserted_window_dims}; + optional> scatter_dims_to_operand_dims; + attrs["scatter_dims_to_operand_dims"] = {/*required=*/true, + AttrTy::kBracedInt64List, + &scatter_dims_to_operand_dims}; + optional index_vector_dim; + attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64, + &index_vector_dim}; + + optional update_computation; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &update_computation}; + + if (!ParseOperands(&operands, /*expected_size=*/3) || + !ParseAttributes(attrs)) { + return false; + } + + ScatterDimensionNumbers dim_numbers = + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/*update_window_dims, + /*inserted_window_dims=*/*inserted_window_dims, + /*scatter_dims_to_operand_dims=*/*scatter_dims_to_operand_dims, + /*index_vector_dim=*/*index_vector_dim); + + instruction = builder->AddInstruction(HloInstruction::CreateScatter( + shape, /*operand=*/operands[0], /*scatter_indices=*/operands[1], + /*updates=*/operands[2], *update_computation, dim_numbers)); + break; + } case HloOpcode::kDomain: { DomainData domain; attrs["domain"] = {/*required=*/true, AttrTy::kDomain, &domain}; @@ -1337,7 +1410,6 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, bool replicated = false; std::vector devices; std::vector tile_assignment_dimensions; - Shape tile_shape; while (lexer_.GetKind() != TokKind::kRbrace) { switch (lexer_.GetKind()) { case TokKind::kw_maximal: @@ -1388,7 +1460,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, break; } case TokKind::kShape: - tile_shape = lexer_.GetShapeVal(); + // TODO(b/112302613): Left here for backward compatibility to ignore the + // removed tile shape data. lexer_.Lex(); break; case TokKind::kRbrace: @@ -1403,19 +1476,12 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, return Error(loc, "replicated shardings should not have any devices assigned"); } - if (!ShapeUtil::Equal(tile_shape, Shape())) { - return Error(loc, - "replicated shardings should not have any tile shape set"); - } sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED); } else if (maximal) { if (devices.size() != 1) { return Error(loc, "maximal shardings should have exactly one device assigned"); } - if (!ShapeUtil::Equal(tile_shape, Shape())) { - return Error(loc, "maximal shardings should not have any tile shape set"); - } sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL); sharding->add_tile_assignment_devices(devices[0]); } else { @@ -1423,9 +1489,6 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, return Error( loc, "non-maximal shardings must have more than one device assigned"); } - if (ShapeUtil::Equal(tile_shape, Shape())) { - return Error(loc, "non-maximal shardings should have a tile shape set"); - } if (tile_assignment_dimensions.empty()) { return Error( loc, @@ -1433,7 +1496,6 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, "dimensions"); } sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER); - *sharding->mutable_tile_shape() = tile_shape; for (tensorflow::int64 dim : tile_assignment_dimensions) { sharding->add_tile_assignment_dimensions(dim); } @@ -1590,6 +1652,24 @@ bool HloParser::SetValueInLiteralHelper(ParsedElemT value, "value ", value, " is out of range for literal's primitive type ", PrimitiveType_Name(literal->shape().element_type()))); } + } else if (std::is_unsigned::value) { + CHECK((std::is_same::value || + std::is_same::value)) + << "Unimplemented checking for ParsedElemT"; + + ParsedElemT upper_bound; + if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) { + upper_bound = std::numeric_limits::max(); + } else { + upper_bound = + static_cast(std::numeric_limits::max()); + } + if (value > upper_bound || value < 0) { + // Value is out of range for LiteralNativeT. + return TokenError(StrCat( + "value ", value, " is out of range for literal's primitive type ", + PrimitiveType_Name(literal->shape().element_type()))); + } } else if (value > static_cast( std::numeric_limits::max()) || value < static_cast( @@ -1744,7 +1824,6 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, break; } case TokKind::kComma: - case TokKind::kComment: // Skip. lexer_.Lex(); break; @@ -2191,6 +2270,26 @@ bool HloParser::ParseAttributeHelper( ->emplace(result); return true; } + case AttrTy::kBracedInt64ListList: { + std::vector> result; + auto parse_and_add_item = [&]() { + std::vector item; + if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, + TokKind::kComma, &item)) { + return false; + } + result.push_back(item); + return true; + }; + if (!ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma, + parse_and_add_item)) { + return false; + } + static_cast>>*>( + attr_out_ptr) + ->emplace(result); + return true; + } case AttrTy::kSliceRanges: { SliceRanges result; if (!ParseSliceRanges(&result)) { @@ -2533,6 +2632,26 @@ bool HloParser::ParseInt64List(const TokKind start, const TokKind end, end, StrCat("expects an int64 list to end with ", TokKindToString(end))); } +bool HloParser::ParseList(const TokKind start, const TokKind end, + const TokKind delim, + const std::function& parse_and_add_item) { + if (!ParseToken(start, StrCat("expects a list starting with ", + TokKindToString(start)))) { + return false; + } + if (lexer_.GetKind() == end) { + // empty + } else { + do { + if (!parse_and_add_item()) { + return false; + } + } while (EatIfPresent(delim)); + } + return ParseToken( + end, StrCat("expects a list to end with ", TokKindToString(end))); +} + // param_list_to_shape ::= param_list '->' shape bool HloParser::ParseParamListToShape(Shape* shape, LocTy* shape_loc) { if (!ParseParamList() || !ParseToken(TokKind::kArrow, "expects '->'")) { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 1c08c51220e88cdd04b26fe3bcd84d28c4436e85..5990a3d4784750feef2e375492851974214db779 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -758,6 +758,46 @@ ENTRY %Gather (input_tensor: f32[50,49,48,47,46], gather_indices: s64[10,9,8,7,5 ROOT %gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26} } +)" +}, +{ +"scatter", +R"(HloModule StringifyScatter + +%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs) +} + +ENTRY %Scatter (input_tensor: f32[50,49,48,47,46], scatter_indices: s64[10,9,8,7,5], updates: f32[10,9,8,7,30,29,28,27,26]) -> f32[50,49,48,47,46] { + %input_tensor = f32[50,49,48,47,46]{4,3,2,1,0} parameter(0) + %scatter_indices = s64[10,9,8,7,5]{4,3,2,1,0} parameter(1) + %updates = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} parameter(2) + ROOT %scatter = f32[50,49,48,47,46]{4,3,2,1,0} scatter(f32[50,49,48,47,46]{4,3,2,1,0} %input_tensor, s64[10,9,8,7,5]{4,3,2,1,0} %scatter_indices, f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} %updates), update_window_dims={4,5,6,7,8}, inserted_window_dims={}, scatter_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, to_apply=%add_F32.v3 +} + +)" +}, +{ + "ConstantUnsignedNoUnderflow", + R"(HloModule ConstantUnsignedNoUnderflow_module + +ENTRY %ConstantUnsignedNoUnderflow () -> u64[] { + ROOT %constant = u64[] constant(1) +} + +)" +}, + +{ + "ConstantUnsignedNoOverflow", + R"(HloModule ConstantUnsignedNoOverflow_module + +ENTRY %ConstantUnsignedNoOverflow () -> u64[] { + ROOT %constant = u64[] constant(9223372036854775807) +} + )" }, }); @@ -803,6 +843,32 @@ ENTRY ReduceR3ToR2.v3 { ROOT reduce = f32[8,16]{1,0} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3 } +)" +}, +// tuple reduce +{ +"TupleReduce", +R"(HloModule TupleReduce + +max_argmax { + value = f32[] parameter(2) + prev_max = f32[] parameter(0) + is_next_larger = pred[] greater-than-or-equal-to(value, prev_max) + max = f32[] select(is_next_larger, value, prev_max) + index = s32[] parameter(3) + prev_argmax = s32[] parameter(1) + argmax = s32[] select(is_next_larger, index, prev_argmax) + ROOT pair = (f32[], s32[]) tuple(max, argmax) +} + +ENTRY reduce_entry { + values = f32[1024]{0} parameter(0) + indices = f32[1024]{0} parameter(1) + init_value = f32[] constant(-inf) + init_index = s32[] constant(-1) + ROOT result = (f32[], s32[]) reduce(values, indices, init_value, init_index), dimensions={0}, to_apply=max_argmax +} + )" }, // infeed/outfeed @@ -1004,6 +1070,30 @@ ENTRY CrossReplicaSumWithSubgroups { ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_group_ids={0,0,1,1}, barrier="abc", to_apply=add } +)" +}, +// all-to-all +{ +"AllToAll", +R"(HloModule AllToAll + +ENTRY AllToAll { + input = f32[128,32]{0,1} parameter(0) + ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={} +} + +)" +}, +// all-to-all with subgroups +{ +"AllToAllWithSubgroups", +R"(HloModule AllToAllWithSubgroups + +ENTRY AllToAllWithSubgroups { + input = f32[128,32]{0,1} parameter(0) + ROOT a2a = f32[128,32]{0,1} all-to-all(input), replica_groups={{1,2},{3,0}}, barrier="abc" +} + )" }, // Iota @@ -1224,6 +1314,40 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] { "is out of range for literal's primitive type F16"); } +TEST_F(HloParserTest, ConstantUnsignedUnderflow) { + const string original = R"( + HloModule ConstantUnsignedUnderflow_module + ENTRY %ConstantUnsignedUnderflow () -> u64[] { + ROOT %constant = u64[] constant(-1) + })"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "is out of range for literal's primitive type U64"); +} + +TEST_F(HloParserTest, ConstantUnsignedOverflow) { + const string original = R"( + HloModule ConstantUnsignedOverflow_module + ENTRY %ConstantUnsignedOverflow () -> u32[] { + ROOT %constant = u32[] constant(4294967296) + })"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); + ExpectHasSubstr(result.status().error_message(), + "is out of range for literal's primitive type U32"); +} + +TEST_F(HloParserTest, ConstantUnsignedInt64Overflow) { + const string original = R"( + HloModule ConstantUnsignedOverflow_module + ENTRY %ConstantUnsignedOverflow () -> u64[] { + ROOT %constant = u64[] constant(9223372036854775808) + })"; + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); +} + TEST_F(HloParserTest, ConstantWithExp) { const string original = R"(HloModule ConstantWithExp_module @@ -1436,6 +1560,81 @@ ENTRY consts { "last"); } +TEST_F(HloParserTest, Comments) { + const string original = R"(/* module description. */ +HloModule comments: + +ENTRY /*comment*/ c1 { + /* blah */ + ROOT const1 = /*foo*/f32[1]{0} constant({12345 /*bar*/}) + /* comment */ +} + +/* something else */ + +)"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, MultilineComments) { + const string original = R"(HloModule multiline_comment: +ENTRY c1 { + /* + ROOT foo = f32[1]{0} constant({12345}) + */ + ROOT const1 = f32[1]{0} constant({12345}) +/* +a +b +c +d + +*/ +})"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, UnterminatedComment) { + const string original = R"(HloModule unterminated_comment: +ENTRY c1 { +/* unterminated + ROOT const1 = f32[1]{0} constant({12345}) +})"; + // Verify that the error message points to the beginning of the unterminated + // comment. + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "/* unterminated\n^"); +} + +TEST_F(HloParserTest, SlashSlashComments) { + const string original = R"(HloModule slash_slash_comment: +// Garbage +ENTRY c1 { + // Foo bar + ROOT const1 = f32[1]{0} constant({12345}) // Something else +})"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, SlashSlashCommentMsDosEolFormat) { + const string original = + "HloModule slash_slash_comment:\r\n// Garbage\r\nENTRY c1 {\r\n// Foo " + "bar\r\nROOT const1 = f32[1]{0} constant({12345}) // Something else\r\n}"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + +TEST_F(HloParserTest, SlashSlashCommentMacEolFormat) { + const string original = + "HloModule slash_slash_comment:\r// Garbage\rENTRY c1 {\r// Foo " + "bar\rROOT const1 = f32[1]{0} constant({12345}) // Something else\r}"; + auto module = ParseHloString(original); + TF_ASSERT_OK(module.status()); +} + TEST_F(HloParserTest, MultipleEntries) { const string original = R"(HloModule multiple_entries: ENTRY c1 { diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h index b3d0a07add39968c6310392ea01daeab8a7dd9af..28194deb0e32252b372a328b006dabaf250fa2c7 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_fix.h +++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PASS_FIX_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PASS_FIX_H_ +#include + #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,9 +36,19 @@ class HloPassFix : public Pass { StatusOr Run(HloModule* module) override { bool changed = false; bool changed_this_iteration = true; + int64 iteration_count = 0; + int64 limit = + std::max(static_cast(1000), module->instruction_count()); while (changed_this_iteration) { TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module)); changed |= changed_this_iteration; + ++iteration_count; + if (iteration_count == limit) { + LOG(ERROR) + << "Unexpectedly number of iterations in HLO passes (" + << iteration_count + << ")\nIf compilation hangs here, please file a bug with XLA."; + } } return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc index cf9ceed5b2fb49eb91fea96d89c8e1efc2a3dad1..9ec983c2bc353955cb23d441d200ac8aa36951b1 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc @@ -282,7 +282,7 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { TF_ASSERT_OK_AND_ASSIGN( SequentialHloOrdering::HloModuleSequence sequence, ScheduleComputationsInModule(*module, - [&TUPLE_SIZE](const BufferValue& buffer) { + [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf( buffer.shape(), TUPLE_SIZE); }, diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 393944c20faa0b09ebc8544543b62566c836739f..879fb3bbab2ada0f924282f16b3d9ccb4c2cb203 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -31,12 +31,9 @@ HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) { CHECK_EQ(1, ShapeUtil::Rank(input_shape)); CHECK_GT(num_tiles, 1); std::vector dimensions(1, num_tiles); - Shape tile_shape = input_shape; - auto& tile_dimension = (*tile_shape.mutable_dimensions())[0]; - tile_dimension = CeilOfRatio(static_cast(tile_dimension), num_tiles); Array assignment(dimensions); std::iota(assignment.begin(), assignment.end(), 0); - return HloSharding(tile_shape, assignment); + return HloSharding(assignment); } HloSharding HloSharding::Tuple(const ShapeTree& sub_shardings) { @@ -104,8 +101,7 @@ string HloSharding::ToString() const { return StrCat( "{maximal device=", static_cast(*tile_assignment_.begin()), "}"); } else { - return StrCat("{", ShapeUtil::HumanString(tile_shape_), " ", "devices=[", - Join(tile_assignment_.dimensions(), ","), "]", + return StrCat("{devices=[", Join(tile_assignment_.dimensions(), ","), "]", Join(tile_assignment_, ","), "}"); } } @@ -127,15 +123,15 @@ std::map HloSharding::UsedDevices(int64* count) const { if (IsTuple()) { for (auto& tuple_element_sharding : tuple_elements()) { auto unique_device = tuple_element_sharding.UniqueDevice(); - if (unique_device.ok()) { - device_map[unique_device.ValueOrDie()] += 1; + if (unique_device) { + device_map[*unique_device] += 1; } } element_count = tuple_elements().size(); } else { auto unique_device = UniqueDevice(); - if (unique_device.ok()) { - device_map[unique_device.ValueOrDie()] += 1; + if (unique_device) { + device_map[*unique_device] += 1; } } if (count != nullptr) { @@ -145,7 +141,6 @@ std::map HloSharding::UsedDevices(int64* count) const { } std::vector HloSharding::TileIndexForDevice(int64 device) const { - CHECK(!ShapeUtil::IsTuple(tile_shape_)); CHECK(!maximal_); CHECK(!IsTuple()); std::vector ret_index; @@ -165,32 +160,43 @@ int64 HloSharding::DeviceForTileIndex( if (maximal_) { return *tile_assignment_.begin(); } - CHECK_EQ(ShapeUtil::Rank(tile_shape_), tile_assignment_.dimensions().size()); return tile_assignment_(index); } -std::vector HloSharding::TileOffsetForDevice(int64 device) const { +std::vector HloSharding::TileOffsetForDevice(const Shape& shape, + int64 device) const { CHECK(!IsTuple()); - std::vector index = TileIndexForDevice(device); if (maximal_) { - // Index will always be all zeroes if we're maximal, and tile_shape_ is not - // valid. - return index; + return std::vector(shape.dimensions_size(), 0); } + + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); + std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { - index[i] *= tile_shape_.dimensions(i); + const int64 shape_dim = shape.dimensions(i); + index[i] = std::min( + index[i] * CeilOfRatio(shape_dim, tile_assignment_.dim(i)), shape_dim); } return index; } -std::vector HloSharding::TileLimitForDevice(int64 device) const { +std::vector HloSharding::TileLimitForDevice(const Shape& shape, + int64 device) const { CHECK(!IsTuple()); - CHECK(!maximal_); // Maximal shardings do not have a valid tile shape. + if (maximal_) { + return std::vector(shape.dimensions().begin(), + shape.dimensions().end()); + } + + CHECK_EQ(shape.dimensions_size(), tile_assignment_.num_dimensions()); std::vector index = TileIndexForDevice(device); for (int64 i = 0; i < index.size(); ++i) { - index[i] = (index[i] + 1) * tile_shape_.dimensions(i); + const int64 shape_dim = shape.dimensions(i); + index[i] = std::min( + (index[i] + 1) * CeilOfRatio(shape_dim, tile_assignment_.dim(i)), + shape_dim); } return index; } @@ -238,40 +244,31 @@ StatusOr HloSharding::GetTupleSharding(const Shape& shape) const { return Tuple(ShapeTree(shape, *this)); } -StatusOr HloSharding::UniqueDevice() const { +tensorflow::gtl::optional HloSharding::UniqueDevice() const { if (IsTuple()) { if (tuple_elements_.empty()) { - return tensorflow::errors::InvalidArgument( - "UniqueDevice() called on empty tuple"); + return tensorflow::gtl::nullopt; } - std::vector> results; - std::transform(tuple_elements_.begin(), tuple_elements_.end(), - std::back_inserter(results), - [](const HloSharding& s) { return s.UniqueDevice(); }); - if (std::all_of(results.begin(), results.end(), - [&](const StatusOr& s) { - return s.ok() && results[0].ok() && - s.ValueOrDie() == results[0].ValueOrDie(); - })) { - return results[0]; - } else { - return tensorflow::errors::InvalidArgument( - "Tuple did not contain a unique device"); + tensorflow::gtl::optional unique_device; + for (auto& tuple_sharding : tuple_elements_) { + auto device = tuple_sharding.UniqueDevice(); + if (!device || (unique_device && *device != *unique_device)) { + return tensorflow::gtl::nullopt; + } + unique_device = device; } + return unique_device; } - if (!replicated_ && maximal_ && !IsTuple()) { + if (!replicated_ && maximal_) { return static_cast(*tile_assignment_.begin()); } - return tensorflow::errors::InvalidArgument( - "UniqueDevice() called on sharding that executes on multiple devices"); + return tensorflow::gtl::nullopt; } -bool HloSharding::HasUniqueDevice() const { - if (IsTuple()) { - return UniqueDevice().status().ok(); - } else { - return !IsReplicated() && IsTileMaximal(); - } +int64 HloSharding::GetUniqueDevice() const { + auto device = UniqueDevice(); + CHECK(device) << "Sharding does not have a unique device: " << *this; + return *device; } Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const { @@ -345,11 +342,12 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, return Status::OK(); } - // The tile rank must be the same as the input rank. - if (ShapeUtil::Rank(shape) != ShapeUtil::Rank(tile_shape_)) { + // The tile assignment tensor must have the same rank as the input. + if (ShapeUtil::Rank(shape) != tile_assignment_.num_dimensions()) { return tensorflow::errors::InvalidArgument( - "Tile rank is different to the input rank. sharding=", ToString(), - ", input_shape=", ShapeUtil::HumanString(shape)); + "Number of tile assignment dimensions is different to the input rank. " + "sharding=", + ToString(), ", input_shape=", ShapeUtil::HumanString(shape)); } // The correct constructor have to be used to create tile maximal shardings. @@ -359,20 +357,6 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, "sharding was intended, use HloSharding::Replicated(). If a device " "placement was intended, use HloSharding::AssignDevice()"); } - - // The tile assignment tensor must contain enough element to cover the full - // shape with tiles of the specified size. - for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) { - int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i); - if (shape.dimensions(i) > total_tile_size) { - return tensorflow::errors::InvalidArgument( - StrCat("Tile assignment tensor has too few element to cover the full " - "shape. Dimension ", - i, ", shape ", shape.dimensions(i), ", total size ", - total_tile_size)); - } - } - return Status::OK(); } @@ -402,7 +386,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, proto.tile_assignment_dimensions().end())); std::copy(proto.tile_assignment_devices().begin(), proto.tile_assignment_devices().end(), tile_assignment.begin()); - return HloSharding(proto.tile_shape(), tile_assignment); + return HloSharding(tile_assignment); } OpSharding HloSharding::ToProto() const { @@ -416,7 +400,6 @@ OpSharding HloSharding::ToProto() const { return result; } - *result.mutable_tile_shape() = tile_shape_; for (int64 dim : tile_assignment_.dimensions()) { result.add_tile_assignment_dimensions(dim); } @@ -433,30 +416,16 @@ OpSharding HloSharding::ToProto() const { return result; } -HloSharding HloSharding::TransformShardedTileShape( - const Shape& new_shape, - const std::function& transform) const { - CHECK(!IsTuple()); +Shape HloSharding::TileShape(const Shape& shape) const { if (IsTileMaximal()) { - return *this; + return shape; } - CHECK_EQ(ShapeUtil::Rank(new_shape), ShapeUtil::Rank(tile_shape())); - Shape new_tile_shape; - new_tile_shape.set_element_type(tile_shape().element_type()); - for (int64 i = 0; i < ShapeUtil::Rank(new_shape); ++i) { - int64 dim; - if (tile_assignment().dim(i) == 1) { - dim = new_shape.dimensions(i); - } else if (transform) { - dim = transform(i, tile_shape().dimensions(i)); - } else { - dim = tile_shape().dimensions(i); - } - new_tile_shape.add_dimensions(dim); + Shape result_shape = shape; + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + (*result_shape.mutable_dimensions())[i] = + CeilOfRatio(shape.dimensions(i), tile_assignment_.dim(i)); } - TF_CHECK_OK( - LayoutUtil::CopyLayoutBetweenShapes(tile_shape_, &new_tile_shape)); - return HloSharding::Tile(new_tile_shape, tile_assignment()); + return result_shape; } HloSharding HloSharding::GetSubSharding(const Shape& shape, @@ -498,9 +467,6 @@ size_t HloSharding::Hash() const { for (uint32 v : tile_assignment_) { h = tensorflow::Hash64Combine(h, std::hash{}(v)); } - for (uint32 v : tile_shape_.dimensions()) { - h = tensorflow::Hash64Combine(h, std::hash{}(v)); - } return h; } diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h index 6f672b0f28d2b85411d70f33da9a9f270aefc0d0..894783e5d1538fa4e8e91b65827121f32040af83 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.h +++ b/tensorflow/compiler/xla/service/hlo_sharding.h @@ -48,22 +48,10 @@ class HloSharding { // the input shape (one tile) assigned to a single device. static HloSharding AssignDevice(int64 device_id); - // Creates a new sharding which splits a shape into tiles each with shape - // `tile_shape`. Each tile is assigned to one device, which is specified by - // `tile_assignment`. Any tensor not a multiple of the tile size in any - // dimension is implicitly padded to the tile size. - // - // e.g. Tile({2, 2}, {0, 1}) on a tensor of shape {3, 2} would look like: - // 2 1 padding - // <------><-> - // +----+----+ - // | 0 | 1 | - // +----+----+ - // - // Split into two tiles, one of which is implicitly padded by one. - static HloSharding Tile(const Shape& tile_shape, - const Array& tile_assignment) { - return HloSharding(tile_shape, tile_assignment); + // Creates a new sharding which splits a shape into tiles amongst the devices + // specified by `tile_assignment`. + static HloSharding Tile(const Array& tile_assignment) { + return HloSharding(tile_assignment); } // Creates a new sharding which splits a one-dimensional input shape into @@ -146,24 +134,30 @@ class HloSharding { // REQUIRES: !IsTuple() int64 DeviceForTileIndex(tensorflow::gtl::ArraySlice index) const; - // Given a device ID, returns the offset within the input space of the + // Given a device ID, returns the offset within the specified shape of the // tile that should be executed on the given core. This returns the lower // extent of the tile in the input space. // REQUIRES: !IsTuple() - std::vector TileOffsetForDevice(int64 device) const; + std::vector TileOffsetForDevice(const Shape& shape, + int64 device) const; - // Given a device ID, returns the limit within the input space of the + // Given a device ID, returns the limit within the specified shape of the // tile that should be executed on the given core. This returns the upper // extent of the tile in the input space. // REQUIRES: !IsTuple() - std::vector TileLimitForDevice(int64 device) const; + std::vector TileLimitForDevice(const Shape& shape, int64 device) const; - // Returns the single device this op operates on. - // REQUIRES: !IsTuple&& !Replicated() && IsTileMaximal() - StatusOr UniqueDevice() const; + // Returns the single device this op operates on. If the sharding does not + // span a single device, the return value will be empty. + // In order for a sharding to span a single device, every leaf sharding must + // be maximal and not replicated, and the used device must match. + tensorflow::gtl::optional UniqueDevice() const; + + // Retrieves the unique device or fails with a CHECK. + int64 GetUniqueDevice() const; // Returns true if this op only uses a single device. - bool HasUniqueDevice() const; + bool HasUniqueDevice() const { return UniqueDevice().has_value(); } // Returns the ShapeTree containing the shardings for each element of this // tuple, if IsTuple, or a ShapeTree with a single element containing this @@ -192,7 +186,6 @@ class HloSharding { bool operator==(const HloSharding& other) const { return replicated_ == other.replicated_ && maximal_ == other.maximal_ && - ShapeUtil::Compatible(tile_shape_, other.tile_shape_) && tile_assignment_ == other.tile_assignment_ && tuple_elements_ == other.tuple_elements_; } @@ -206,9 +199,6 @@ class HloSharding { } }; - // Gets the tile shape. - // REQUIRES: !IsTileMaximal() && !IsTuple() - const Shape& tile_shape() const { return tile_shape_; } // Gets the tile assignment tensor. // REQUIRES: !IsReplicated() && !IsTuple() const Array& tile_assignment() const { return tile_assignment_; } @@ -220,25 +210,15 @@ class HloSharding { return tuple_elements_; } - // Return a new sharding that can apply to the given new shape. - // If this sharding is tile-maximal, the returned sharding will be the same as - // this sharding. If this sharding is not tile-maximal, the returned - // sharding's tile size will differ: - // - Non-sharded dimensions will be adapted to be the same as `new_shape`; - // tile_dimension(i) = new_shape.dimensions(i); - // - Sharded dimensions will be kept the same unless `transform` is supplied - // in which case tile_dimension(i) = transform(i, tile_dimension(i)); - // REQUIRES: !IsTuple(). - HloSharding TransformShardedTileShape( - const Shape& new_shape, - const std::function& transform = nullptr) const; + // Gets the tile shape. + // REQUIRES: !IsTuple() + Shape TileShape(const Shape& shape) const; private: HloSharding() : replicated_(true), maximal_(true), tuple_(false), - tile_shape_(), tile_assignment_({0}) {} // device_id values: // -2: magic number to mean unassigned device, used by spatial partitioning @@ -250,15 +230,13 @@ class HloSharding { : replicated_(false), maximal_(true), tuple_(false), - tile_shape_(), tile_assignment_({1}, device_id) {} - HloSharding(const Shape& tile_shape, const Array& tile_assignment) + explicit HloSharding(const Array& tile_assignment) : replicated_(false), maximal_(false), tuple_(false), - tile_shape_(tile_shape), tile_assignment_(tile_assignment) {} - HloSharding(const std::vector& tuple_shardings) + explicit HloSharding(const std::vector& tuple_shardings) : replicated_(false), maximal_(false), tuple_(true), @@ -281,7 +259,6 @@ class HloSharding { bool replicated_; bool maximal_; bool tuple_; - Shape tile_shape_; Array tile_assignment_; // Only non-empty when tuple_ is true, but because empty tuples are allowed // may also be empty even then. This is a flattened list of all the leaf diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc index 94f5a3b273b2fd7e545472c42f3863f549dd3db1..a2c1d39d0d4893333b3c2ed0e3418b01dac8cefd 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc @@ -158,7 +158,6 @@ ShapeTree GetTupleSharding(HloInstruction* tuple) { const HloSharding* GetOperandSharding(const HloInstruction* operand, const DomainMetadata::Domain& domain, const HloSharding& sharding) { - DCHECK_EQ(domain.reach_set.count(const_cast(operand)), 1); // Here the user of operand is within the domain instruction set, and since it // is user of operand, we need to look into the enter_domains set. If this is // not a kDomain within the user domains set, then return the operand @@ -203,10 +202,17 @@ StatusOr ApplyDomainShardingPass(const DomainMetadata::Domain& domain, for (int64 i = 0; i < instruction->operand_count(); ++i) { const HloSharding* operand_sharding = GetOperandSharding(instruction->operand(i), domain, sharding); - if (operand_sharding != nullptr && - shape_tree.element({i}) != *operand_sharding) { - *shape_tree.mutable_element({i}) = *operand_sharding; - ++tuple_assigned; + if (operand_sharding != nullptr) { + HloSharding operand_subsharding = HloSharding::Replicate(); + if (operand_sharding == &sharding) { + operand_subsharding = + sharding.GetSubSharding(instruction->shape(), {i}); + operand_sharding = &operand_subsharding; + } + if (shape_tree.element({i}) != *operand_sharding) { + *shape_tree.mutable_element({i}) = *operand_sharding; + ++tuple_assigned; + } } } if (tuple_assigned > 0) { diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc index 7baa927d0e2b1abbbb2333633d16dd605ae8c8ef..45fc300fcaf5a301fe11768da77a7c0907919c39 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc @@ -39,7 +39,6 @@ Array MakeArray(tensorflow::gtl::ArraySlice dimensions, class HloShardingTest : public HloTestBase {}; TEST_F(HloShardingTest, Replicate) { - Shape tile_shape = ShapeUtil::MakeShape(U32, {4}); HloSharding sharding = HloSharding::Replicate(); EXPECT_TRUE(sharding.IsReplicated()); EXPECT_TRUE(sharding.IsTileMaximal()); @@ -51,7 +50,7 @@ TEST_F(HloShardingTest, Replicate) { EXPECT_IS_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4}), /*num_devices=*/2)); - EXPECT_IS_NOT_OK(sharding.UniqueDevice()); + EXPECT_FALSE(sharding.HasUniqueDevice()); } TEST_F(HloShardingTest, DevicePlacement) { @@ -60,7 +59,7 @@ TEST_F(HloShardingTest, DevicePlacement) { EXPECT_TRUE(sharding.IsTileMaximal()); EXPECT_FALSE(sharding.UsesDevice(0)); EXPECT_TRUE(sharding.UsesDevice(5)); - EXPECT_EQ(5, sharding.UniqueDevice().ValueOrDie()); + EXPECT_EQ(5, sharding.GetUniqueDevice()); HloSharding other = HloSharding::Replicate(); EXPECT_NE(other, sharding); @@ -79,37 +78,22 @@ TEST_F(HloShardingTest, DevicePlacement) { TEST_F(HloShardingTest, Tile) { { // Test should fail because of a duplicate tile assignment. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 0, 2, 3})); + HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 0, 2, 3})); EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {4, 6}), /*num_devices=*/4)); } { // Test should fail because of more devices used then `num_device`. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); + HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 1, 2, 3})); EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4, 6}), /*num_devices=*/2)); } - { - // Test should fail because the total tiled size in dimension 0 is 4 but we - // have 6 elements along that dimensions. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); - EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {6, 3}), - /*num_devices=*/4)); - } - { // Test should pass. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); + Shape shape = ShapeUtil::MakeShape(U32, {4, 5}); + HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1})); EXPECT_IS_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {3, 5}), /*num_devices=*/5)); @@ -118,12 +102,16 @@ TEST_F(HloShardingTest, Tile) { EXPECT_EQ(2, sharding.DeviceForTileIndex({1, 0})); EXPECT_EQ(1, sharding.DeviceForTileIndex({1, 1})); - EXPECT_EQ(sharding.TileOffsetForDevice(0), (std::vector{0, 0})); - EXPECT_EQ(sharding.TileOffsetForDevice(3), (std::vector{0, 3})); - EXPECT_EQ(sharding.TileOffsetForDevice(2), (std::vector{2, 0})); - EXPECT_EQ(sharding.TileOffsetForDevice(1), (std::vector{2, 3})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 0), + (std::vector{0, 0})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 3), + (std::vector{0, 3})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 2), + (std::vector{2, 0})); + EXPECT_EQ(sharding.TileOffsetForDevice(shape, 1), + (std::vector{2, 3})); - EXPECT_IS_NOT_OK(sharding.UniqueDevice()); + EXPECT_FALSE(sharding.HasUniqueDevice()); } } @@ -135,8 +123,7 @@ TEST_F(HloShardingTest, NestedTuple) { ShapeUtil::MakeShape(F32, {4, 6}), }); - HloSharding tiled_sharding = HloSharding::Tile( - ShapeUtil::MakeShape(F32, {4, 3}), Array({{0, 1}})); + HloSharding tiled_sharding = HloSharding::Tile(Array({{0, 1}})); OpSharding proto; proto.set_type(OpSharding::Type::OpSharding_Type_TUPLE); *proto.add_tuple_shardings() = HloSharding::Replicate().ToProto(); @@ -187,32 +174,11 @@ TEST_F(HloShardingTest, Hash) { } { - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding1 = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); - HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}), - MakeArray({2, 2}, {0, 3, 2, 1})); - EXPECT_TRUE(hash_compare_equal(sharding1, sharding2)); - } - - { - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding1 = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); - HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}), - MakeArray({2, 2}, {0, 3, 2, 1})); + HloSharding sharding1 = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1})); + HloSharding sharding2 = HloSharding::Tile(MakeArray({2, 2}, {0, 3, 2, 1})); EXPECT_TRUE(hash_compare_equal(sharding1, sharding2)); } - { - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding1 = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 3, 2, 1})); - HloSharding sharding2 = HloSharding::Tile(ShapeUtil::MakeShape(U32, {2, 3}), - MakeArray({2, 2}, {0, 3, 1, 2})); - EXPECT_FALSE(hash_compare_equal(sharding1, sharding2)); - } - HloSharding default_sharding = HloSharding::Replicate(); { ShapeTree shape_tree(ShapeUtil::MakeTupleShape({}), @@ -259,19 +225,6 @@ TEST_F(HloShardingTest, Hash) { } } -TEST_F(HloShardingTest, TransformShardedTileShapeTest) { - HloSharding sharding = - HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}), - Array4D({{{{0, 1}, {2, 3}}}})); - HloSharding result = sharding.TransformShardedTileShape( - ShapeUtil::MakeShape(F32, {13, 15, 17, 19}), - [](int dim, int value) { return dim * 111; }); - HloSharding expected = - HloSharding::Tile(ShapeUtil::MakeShape(F32, {13, 15, 222, 333}), - Array4D({{{{0, 1}, {2, 3}}}})); - EXPECT_EQ(result, expected); -} - TEST_F(HloShardingTest, ToStringReplicatedTest) { HloSharding sharding = HloSharding::Replicate(); EXPECT_EQ(sharding.ToString(), "{replicated}"); @@ -284,9 +237,8 @@ TEST_F(HloShardingTest, ToStringAssignDeviceTest) { TEST_F(HloShardingTest, ToStringTiledTest) { HloSharding sharding = - HloSharding::Tile(ShapeUtil::MakeShape(S32, {7, 11, 13}), - Array3D({{{2, 3}}, {{5, 7}}})); - EXPECT_EQ(sharding.ToString(), "{s32[7,11,13] devices=[2,1,2]2,3,5,7}"); + HloSharding::Tile(Array3D({{{2, 3}}, {{5, 7}}})); + EXPECT_EQ(sharding.ToString(), "{devices=[2,1,2]2,3,5,7}"); } TEST_F(HloShardingTest, ToStringTupleTest) { @@ -294,21 +246,18 @@ TEST_F(HloShardingTest, ToStringTupleTest) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5}), ShapeUtil::MakeShape(U32, {7, 25}), ShapeUtil::MakeShape(S32, {9, 11})}), - {HloSharding::Replicate(), - HloSharding::Tile(ShapeUtil::MakeShape(U32, {7, 13}), - Array2D({{3, 5}})), + {HloSharding::Replicate(), HloSharding::Tile(Array2D({{3, 5}})), HloSharding::AssignDevice(3)}); EXPECT_EQ(sharding.ToString(), - "{{replicated}, {u32[7,13] devices=[1,2]3,5}, {maximal device=3}}"); + "{{replicated}, {devices=[1,2]3,5}, {maximal device=3}}"); } TEST_F(HloShardingTest, OstreamTest) { HloSharding sharding = - HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 5, 7, 11}), - Array4D({{{{0, 1}, {2, 3}}}})); + HloSharding::Tile(Array4D({{{{0, 1}, {2, 3}}}})); std::ostringstream oss; oss << sharding; - EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}"); + EXPECT_EQ(oss.str(), "{devices=[1,1,2,2]0,1,2,3}"); } TEST_F(HloShardingTest, ParseHloString) { @@ -319,8 +268,7 @@ TEST_F(HloShardingTest, ParseHloString) { }; check(HloSharding::Replicate()); check(HloSharding::AssignDevice(2)); - check(HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), - Array4D({{{{0}, {1}}}}))); + check(HloSharding::Tile(Array4D({{{{0}, {1}}}}))); // Empty tuple. One sharding is required for empty tuples, as we need to be // able to assign sharding to them, even though they have no leaves. check(HloSharding::Tuple(ShapeUtil::MakeTupleShape({}), @@ -332,8 +280,7 @@ TEST_F(HloShardingTest, ParseHloString) { ShapeUtil::MakeShape(F32, {3, 5, 7}), ShapeUtil::MakeShape(F32, {3, 7})}); check(HloSharding::Tuple( - tuple_shape, {HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), - Array4D({{{{0}, {1}}}})), + tuple_shape, {HloSharding::Tile(Array4D({{{{0}, {1}}}})), HloSharding::Replicate(), HloSharding::AssignDevice(1)})); } { @@ -343,8 +290,7 @@ TEST_F(HloShardingTest, ParseHloString) { ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5, 7}), ShapeUtil::MakeShape(F32, {3, 7})})}); std::vector leaf_shardings = { - HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), - Array4D({{{{0}, {1}}}})), + HloSharding::Tile(Array4D({{{{0}, {1}}}})), HloSharding::Replicate(), HloSharding::AssignDevice(1)}; ShapeTree sharding_tree(tuple_shape, HloSharding::Replicate()); // Assign leaf_shardings to sharding_tree leaves. diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc index 48f676db85ab5e7711d9e9ac900306a9ea85ef10..b78bfa0cdf4db605576fa11e18ce6c654c6a0b6d 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc @@ -101,11 +101,11 @@ const string& HloTfGraphBuilder::GetNodeNameForInstruction( } }; string node_name; - if (debug_options_.xla_hlo_tfgraph_device_scopes() && - instruction->has_sharding() && - instruction->sharding().HasUniqueDevice()) { - node_name = StrCat( - "dev", instruction->sharding().UniqueDevice().ConsumeValueOrDie()); + if (debug_options_.xla_hlo_tfgraph_device_scopes()) { + auto device = instruction->sharding_unique_device(); + if (device) { + node_name = StrCat("dev", *device); + } } // If an instruction is fused, put it in the subgraph of the fusion; // otherwise, put it in the computation subgraph. @@ -215,10 +215,10 @@ Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) { NodeDef* node_def = graph_def_.add_node(); node_def->set_name(GetNodeNameForInstruction(instruction)); node_def->set_op(GetOpDefName(instruction)); - if (instruction->has_sharding() && - instruction->sharding().HasUniqueDevice()) { - TF_ASSIGN_OR_RETURN(int64 device, instruction->sharding().UniqueDevice()); - node_def->set_device(GetDeviceName(device)); + + auto device = instruction->sharding_unique_device(); + if (device) { + node_def->set_device(GetDeviceName(*device)); } SetNodeAttrs(instruction, node_def); if (instruction->opcode() == HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/service/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h index 533429608bc2e13626a3e746fbe465398e1f4bb4..4458c251dee4af365e39027dd4289925c8890efd 100644 --- a/tensorflow/compiler/xla/service/hlo_token.h +++ b/tensorflow/compiler/xla/service/hlo_token.h @@ -44,7 +44,6 @@ enum class TokKind { kRparen, // ( ) kArrow, // -> - kComment, // /*xxx*/ // Keywords kw_HloModule, diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index 4e3c9df3a036890ce25f5b14603d275263e8659b..7fd99fc93050b386c5ad24e6dcd2fea1bf652c3f 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -283,8 +283,7 @@ std::ostream& operator<<(std::ostream& out, string InstructionValueSet::ToString() const { string out = StrCat("InstructionValueSet(", ShapeUtil::HumanString(shape()), ")\n"); - ForEachElement([this, &out](const ShapeIndex& index, - const HloValueSet& value_set) { + ForEachElement([&out](const ShapeIndex& index, const HloValueSet& value_set) { StrAppend(&out, " ", index.ToString(), " : ", value_set.ToString(), "\n"); }); return out; diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 25fa319faf13d8bef69381c869f08f4948fc3519..e7674f3ddd5baa87c872d1c0b40bff340f3cd911 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -105,6 +105,15 @@ Status ShapeVerifier::HandleCrossReplicaSum(HloInstruction* crs) { ShapeInference::InferCrossReplicaSumShape(operand_shapes)); } +Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) { + std::vector operand_shapes; + for (const HloInstruction* operand : hlo->operands()) { + operand_shapes.push_back(&operand->shape()); + } + return CheckShape(hlo, + ShapeInference::InferAllToAllTupleShape(operand_shapes)); +} + Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) { return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape( reduce_precision->operand(0)->shape(), @@ -185,7 +194,67 @@ Status ShapeVerifier::HandleHostCompute(HloInstruction*) { return Status::OK(); } -Status ShapeVerifier::HandleRng(HloInstruction*) { return Status::OK(); } +bool ShapeVerifier::HasCompatibleElementTypes(const Shape& shape_0, + const Shape& shape_1, + const Shape& result_shape) { + return ShapeUtil::SameElementType(shape_0, shape_1) && + (ShapeUtil::SameElementType(shape_0, result_shape) || + (allow_mixed_precision_ && + ShapeUtil::SameElementTypeIgnoringFpPrecision(shape_0, + result_shape))); +} + +Status ShapeVerifier::HandleRng(HloInstruction* instruction) { + if (instruction->operand_count() != 2) { + return InternalError("Expected two operands for Rng instruction: %s", + instruction->ToString().c_str()); + } + + const Shape& shape_0 = instruction->operand(0)->shape(); + const Shape& shape_1 = instruction->operand(1)->shape(); + if (!ShapeUtil::IsScalar(shape_0) || !ShapeUtil::IsScalar(shape_1)) { + return InternalError( + "Expected scalar types for the two operands of Rng instruction: %s", + instruction->ToString().c_str()); + } + + if (!HasCompatibleElementTypes(shape_0, shape_1, instruction->shape())) { + return InternalError( + "Expected compatible element types for the result and the two operands" + " of Rng instruction: %s", + instruction->ToString().c_str()); + } + + PrimitiveType element_type = shape_0.element_type(); + switch (instruction->random_distribution()) { + case RNG_UNIFORM: + if (!primitive_util::IsFloatingPointType(element_type) && + !primitive_util::IsIntegralType(element_type) && + element_type != PRED) { + return InternalError( + "Element type not supported." + " Expected element to be of floating point type, integral type or" + " predicate type for RngUniform: %s", + instruction->ToString().c_str()); + } + break; + + case RNG_NORMAL: + if (!primitive_util::IsFloatingPointType(element_type)) { + return InternalError( + "Element type not supported." + " Expected element to be FloatingPointType for RngNormal: %s", + instruction->ToString().c_str()); + } + break; + default: + return InternalError( + "Invalid Rng distribution %s", + RandomDistribution_Name(instruction->random_distribution()).c_str()); + } + + return Status::OK(); +} Status ShapeVerifier::HandleReverse(HloInstruction* reverse) { return CheckShape( @@ -224,10 +293,13 @@ Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) { } Status ShapeVerifier::HandleReduce(HloInstruction* reduce) { + if (!ShapeUtil::IsArray(reduce->shape())) { + return InvalidArgument("Variadic reduce is not supported."); + } return CheckShape( reduce, ShapeInference::InferReduceShape( - reduce->operand(0)->shape(), reduce->operand(1)->shape(), + {&reduce->operand(0)->shape(), &reduce->operand(1)->shape()}, reduce->dimensions(), reduce->to_apply()->ComputeProgramShape())); } @@ -451,9 +523,9 @@ namespace { // inputs. Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { switch (instruction->opcode()) { - // White list the following opcodes for mixed-precision check, because they - // involve data pass through or grouping via tuples, where the precisions - // of buffers can be different. + // White list the following opcodes for mixed-precision check, because + // they involve data pass through or grouping via tuples, where the + // precisions of buffers can be different. case HloOpcode::kCall: case HloOpcode::kConditional: case HloOpcode::kConstant: @@ -510,6 +582,15 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) { gather->gather_dimension_numbers(), gather->gather_window_bounds())); } +Status ShapeVerifier::HandleScatter(HloInstruction* scatter) { + return CheckShape( + scatter, ShapeInference::InferScatterShape( + scatter->operand(0)->shape(), scatter->operand(1)->shape(), + scatter->operand(2)->shape(), + scatter->to_apply()->ComputeProgramShape(), + scatter->scatter_dimension_numbers())); +} + Status ShapeVerifier::HandleAfterAll(HloInstruction* token) { std::vector operand_shapes; for (const HloInstruction* operand : token->operands()) { @@ -626,7 +707,8 @@ string ComputationsToString( // Verifies various invariants about the structure of the HLO: // -// (1) each instruction has a non-null parent() set to the HloComputation which +// (1) each instruction has a non-null parent() set to the HloComputation +// which // contains it. // // (2) each computation has a non-null parent() set to the HloModule which @@ -660,9 +742,9 @@ Status VerifyHloStructure(HloModule* module) { } // Check that operands are in the same computation separately from verifying - // parent() correctness so conditions like a null HloInstruction::parent() are - // identified and reported explicitly above rather than reporting a mismatched - // operand. + // parent() correctness so conditions like a null HloInstruction::parent() + // are identified and reported explicitly above rather than reporting a + // mismatched operand. for (const HloComputation* computation : module->computations()) { for (const HloInstruction* instruction : computation->instructions()) { for (int i = 0; i < instruction->operand_count(); ++i) { @@ -686,13 +768,14 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { HloComputation* fused_computation = fusion->fused_instructions_computation(); if (fusion != fused_computation->FusionInstruction()) { return InternalError( - "Instruction of fused computation does not match expected instruction " + "Instruction of fused computation does not match expected " + "instruction " "%s.", fusion->ToString().c_str()); } - // Fused root instruction and fused parameters must all be owned by the fusion - // computation. + // Fused root instruction and fused parameters must all be owned by the + // fusion computation. bool root_owned = false; const std::vector& fused_parameters = fusion->fused_parameters(); @@ -734,8 +817,8 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { fusion->ToString().c_str()); } - // All uses of fused instructions must be in the fusion computation, and every - // non-root instruction must have at least one use. + // All uses of fused instructions must be in the fusion computation, and + // every non-root instruction must have at least one use. for (auto* instruction : fusion->fused_instructions_computation()->instructions()) { if (instruction != fused_root) { @@ -779,7 +862,8 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { if (!ShapeUtil::Compatible(fused_param->shape(), fusion->operand(param_no)->shape())) { return InternalError( - "Shape mismatch between parameter number %lld and its operand in %s.", + "Shape mismatch between parameter number %lld and its operand in " + "%s.", param_no, fusion->ToString().c_str()); } } @@ -897,8 +981,9 @@ Status CheckSameChannel(const HloInstruction* instr1, return Status::OK(); } -// Checks if the given two instructions have the same is_host_transfer attribute -// value. Intsructions must be send/recv instructions or their 'done' variant. +// Checks if the given two instructions have the same is_host_transfer +// attribute value. Intsructions must be send/recv instructions or their +// 'done' variant. Status CheckSameIsHostTransfer(const HloInstruction* instr1, const HloInstruction* instr2) { const HloSendRecvInstruction* send_recv1 = @@ -909,7 +994,8 @@ Status CheckSameIsHostTransfer(const HloInstruction* instr1, TF_RET_CHECK(send_recv2 != nullptr); if (send_recv1->is_host_transfer() != send_recv2->is_host_transfer()) { return InternalError( - "Expected instructions to have the same is-host-transfer property: %s, " + "Expected instructions to have the same is-host-transfer property: " + "%s, " "%s ", instr1->ToString().c_str(), instr2->ToString().c_str()); } @@ -928,7 +1014,8 @@ Status VerifySendsAndRecvs(const HloModule& module) { host_channels.insert({sendrecv->channel_id(), sendrecv}); if (!it_inserted.second) { return FailedPrecondition( - "Channel %lld is used for multiple host send/recv instructions: %s " + "Channel %lld is used for multiple host send/recv instructions: " + "%s " "and " "%s", sendrecv->channel_id(), sendrecv->ToString().c_str(), diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 79f7aa9f4ce66cc9b53d016f2e126033492c81e9..c942fab08e1ace75bccb8762954787a4366922a9 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -45,6 +45,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleConvolution(HloInstruction* convolution) override; Status HandleFft(HloInstruction* fft) override; Status HandleCrossReplicaSum(HloInstruction* crs) override; + Status HandleAllToAll(HloInstruction* hlo) override; Status HandleReducePrecision(HloInstruction* reduce_precision) override; Status HandleInfeed(HloInstruction*) override; Status HandleOutfeed(HloInstruction*) override; @@ -83,6 +84,7 @@ class ShapeVerifier : public DfsHloVisitor { HloInstruction* batch_norm_inference) override; Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override; Status HandleGather(HloInstruction* gather) override; + Status HandleScatter(HloInstruction* scatter) override; Status HandleAfterAll(HloInstruction* token) override; Status FinishVisit(HloInstruction*) override { return Status::OK(); } @@ -104,6 +106,13 @@ class ShapeVerifier : public DfsHloVisitor { Status CheckVariadicShape(const HloInstruction* instruction); private: + // Return true if the shapes of the two operands have the same element type, + // and the result shape either has the same element type as the operand + // shapes or mixed precision is allowed and the result shape and the operand + // shapes have floating point element types. + bool HasCompatibleElementTypes(const Shape& shape_0, const Shape& shape_1, + const Shape& result_shape); + // Whether the inputs and output of an instruction can contain both F32s and // BF16s. Tuples that include both F32s and BF16s are allowed regardless of // this flag. diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 04c6ba3eeb92bad2b5b69f7f56e73e1f7a8148aa..d764964f3c3dc58a54bd0307f8b625076c14f3e5 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -34,7 +34,17 @@ namespace { using ::testing::HasSubstr; -using HloVerifierTest = HloTestBase; +class HloVerifierTest : public HloTestBase { + public: + HloVerifierTest() + : HloTestBase(/*allow_mixed_precision_in_hlo_verifier=*/false) {} +}; + +class HloVerifierTestAllowMixedPrecision : public HloTestBase { + public: + HloVerifierTestAllowMixedPrecision() + : HloTestBase(/*allow_mixed_precision_in_hlo_verifier=*/true) {} +}; TEST_F(HloVerifierTest, NullInstructionParent) { HloComputation::Builder builder(TestName()); @@ -174,5 +184,96 @@ ENTRY entry { HasSubstr("shape does not match parameter")); } +TEST_F(HloVerifierTest, RngOpnd0NotScalar) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngOpnd0NotScalar { + constant.0 = f32[] constant(0) + constant.1 = f16[2] constant({1, 3}) + ROOT rng.0 = f32[10]{0} rng(f32[] constant.0, f16[2] constant.1), + distribution=rng_uniform + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("Expected scalar type")); +} + +TEST_F(HloVerifierTest, RngOperandElementTypesDoNotMatch) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngOperandElementTypesNotMatch { + constant.0 = f32[] constant(0) + constant.1 = f16[] constant(1) + ROOT rng.0 = f32[10]{0} rng(f32[] constant.0, f16[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected compatible element types")); +} + +TEST_F(HloVerifierTest, RngMixedPrecisionNotAllowed) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngResultElementTypeNotMatch { + constant.0 = f32[] constant(0) + constant.1 = f32[] constant(1) + ROOT rng.0 = f16[10]{0} rng(f32[] constant.0, f32[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected compatible element types")); +} + +TEST_F(HloVerifierTestAllowMixedPrecision, RngMixedPrecisionAllowed) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngResultElementTypeNotMatch { + constant.0 = f32[] constant(0) + constant.1 = f32[] constant(1) + ROOT rng.0 = f16[10]{0} rng(f32[] constant.0, f32[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_TRUE(status.ok()); +} + +TEST_F(HloVerifierTest, RngElementTypeNotSupported) { + const char* const hlo_string = R"( + HloModule Module + + ENTRY RngElementTypeNotSupported { + constant.0 = s32[] constant(0) + constant.1 = s32[] constant(1) + ROOT rng.0 = s32[10]{0} rng(s32[] constant.0, s32[] constant.1), + distribution=rng_normal + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), HasSubstr("Element type not supported")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index d7458c338e9f1df9fac90270845aae0b8f779ee2..bb5b40a8a87c5eab5a5b1599581a81bbd064511b 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -36,7 +36,8 @@ string HumanReadableProfileBuilder::ToString() const { computation_name_.c_str(), HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); - auto print_op = [&](const OpInfo& op) { + int64 cumulative_cycles = 0; + auto print_op = [&](const OpInfo& op, bool is_total = false) { // Skip ops with 0 optimal seconds and 0 actual cycles. These are ops that // were expected to be free and are actually free -- things like (on most // backends) kParameter or kConstant HLOs. There's no need to clutter the @@ -59,27 +60,44 @@ string HumanReadableProfileBuilder::ToString() const { } } + double cumulative_cycles_percent = 0; double cycles_percent = 0; + if (!is_total) { + cumulative_cycles += op.cycles; + } if (total_cycles_ > 0) { cycles_percent = op.cycles / static_cast(total_cycles_) * 100; + cumulative_cycles_percent = + cumulative_cycles / static_cast(total_cycles_) * 100; + } + + string cycles_percent_str; + if (is_total) { + // Leaving off the two trailing decimal points of "100.%" lets us save two + // columns in the output. + cycles_percent_str = "100.% 100Σ"; + } else { + cycles_percent_str = + Printf("%5.2f%% %2.0fΣ", cycles_percent, cumulative_cycles_percent); } double nsecs = op.cycles / clock_rate_ghz_; - Appendf(&s, - "%15lld cycles (%6.2f%%) :: %12.1f usec %22s :: %18s " - ":: %18s :: %14s :: %16s :: %s\n", - op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), - op.optimal_seconds < 0 - ? "" - : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), - op.flop_count <= 0 - ? "" - : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), - op.transcendental_count <= 0 ? "" - : HumanReadableNumTranscendentalOps( - op.transcendental_count, nsecs) - .c_str(), - bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); + Appendf( + &s, + "%15lld cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: " + "%16s :: %s\n", + op.cycles, cycles_percent_str.c_str(), CyclesToMicroseconds(op.cycles), + op.optimal_seconds < 0 + ? "" + : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), + op.flop_count <= 0 + ? "" + : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), + op.transcendental_count <= 0 + ? "" + : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs) + .c_str(), + bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); }; float optimal_seconds_sum = 0.0; @@ -98,7 +116,8 @@ string HumanReadableProfileBuilder::ToString() const { VLOG(1) << "Total floating point ops: " << total_flops; print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops, - total_transcendentals, total_bytes, optimal_seconds_sum}); + total_transcendentals, total_bytes, optimal_seconds_sum}, + /*is_total=*/true); // Sort ops in decreasing order of cycles, and print them. std::vector sorted_ops(op_infos_); diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc index 8b2df3256776a7d77517daff1fe282b0dbde7045..3531b7223fb11df212fa8d30e3adba6aac6c5679 100644 --- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc +++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc @@ -447,7 +447,7 @@ int64 FindSourcePositionForPassthroughResultDim(ArraySlice operand_shape, int64 indexed_source_subarray_size = std::accumulate(operand_shape.begin() + source_passthrough_dim + 1, - operand_shape.end(), 1, std::multiplies()); + operand_shape.end(), 1LL, std::multiplies()); return FindSuffixWithProduct(result_shape, indexed_source_subarray_size); } @@ -764,7 +764,7 @@ IndexedArrayAnalysis::FoldReshapeOfGatherNoDegenerateDims( &new_scalar_indexed_source_shape, source_dim_for_new_scalar_indexed_node, scalar_indexed_source_shape.dimensions(scalar_indexed->source_dim())); - CHECK_EQ(c_accumulate(new_scalar_indexed_source_shape, 1l, + CHECK_EQ(c_accumulate(new_scalar_indexed_source_shape, 1LL, std::multiplies()), ShapeUtil::ElementsIn(scalar_indexed_source_shape)); diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index af07370135ca2b2e53fcbcb53696e0aa12bf7a6f..f33942d67907d8f40811bde5041350a2e1e1f1fc 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -120,6 +120,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kConditional: case HloOpcode::kConvolution: case HloOpcode::kCrossReplicaSum: + case HloOpcode::kAllToAll: case HloOpcode::kCustomCall: case HloOpcode::kDivide: case HloOpcode::kDomain: @@ -141,6 +142,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kReduceWindow: case HloOpcode::kRemainder: case HloOpcode::kRng: + case HloOpcode::kScatter: case HloOpcode::kSelectAndScatter: case HloOpcode::kSend: case HloOpcode::kSendDone: diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 9b109022fbfc698f7dadc678ef837da270a5e74a..db6b910b32f8ec234c4cf1c331a1aa3bb2f9389f 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -104,7 +104,7 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { } // No "synchronize all activity" implemented for this platform at the moment. - bool SynchronizeAllActivity() override { return false; } + bool SynchronizeAllActivity() override { return true; } bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override { return false; } diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 9705687b004976fc5d35ddeb1c2a69c65ed50358..805fdb2d5bd8a08490b354d60f281c8f99bc20d8 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -874,8 +874,8 @@ void LayoutAssignment::SetupCopiedInstruction(const HloInstruction& instruction, // HostCompute module. // Otherwise it is preferable to leave the new instruction without device, // and let the automatic device placer to choose the best location. - if (!sharding.HasUniqueDevice() || - HloSharding::IsReservedDevice(sharding.UniqueDevice().ValueOrDie())) { + auto device = sharding.UniqueDevice(); + if (!device || HloSharding::IsReservedDevice(*device)) { copy->set_sharding(sharding); } } @@ -1228,7 +1228,7 @@ Status LayoutAssignment::PropagateUseConstraintToDefs( const PointsToSet& points_to_set = constraints->points_to_analysis().GetPointsToSet(instruction); return points_to_set.ForEachElementWithStatus( - [this, &shape_layout, constraints]( + [&shape_layout, constraints]( const ShapeIndex& index, const PointsToSet::BufferList& buffers) -> Status { if (ShapeUtil::IsLeafIndex(shape_layout.shape(), index)) { @@ -1563,7 +1563,7 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // and the computation result. The latter two are specified in // computation_layout, so we only need to keep the existing layouts for // infeeds. Clearing the layouts here avoids hiding potential bugs in the - // layout assignment pass that may accidently use the existing layout. + // layout assignment pass that may accidentally use the existing layout. for (HloInstruction* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kBitcast) { // bitcasts are inherently layout sensitive and so a bitcast instruction diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc index 941d940684651792467a84e816a91533ce11dd63..fe5ec1cc66d06e85ce70625ef7cf764a37b29166 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -56,12 +56,12 @@ ENTRY while3 { )"; CompileAndVerifyIr(hlo_string, R"( -; CHECK-LABEL: @body(i8* align 4 dereferenceable(4) %retval +; CHECK-LABEL: @body(i8* %retval ; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] ; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:[0-9]+]] ; -; CHECK-LABEL: @condition(i8* align 1 dereferenceable(1) %fusion, i8* noalias %run_options, i8** noalias %params -; CHECK: %[[cond_state_buf_ptr:.*]] = getelementptr inbounds i8*, i8** %params, i64 0 +; CHECK-LABEL: @condition(i8* %retval, i8* noalias %run_options, i8** noalias %params +; CHECK: %[[cond_state_buf_ptr:.*]] = getelementptr inbounds i8*, i8** %temps, i64 0 ; CHECK: %[[cond_state_buf_untyped:.*]] = load i8*, i8** %[[cond_state_buf_ptr]] ; CHECK: %[[cond_state_buf_typed:.*]] = bitcast i8* %[[cond_state_buf_untyped]] to float* ; CHECK: load float, float* %[[cond_state_buf_typed]], !alias.scope ![[alias_scope_md_for_store]], !noalias ![[noalias_md_for_load:.*]] diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index ad3b662c20ac53b0a6d634b16b3b908f730f3d2d..ccb9fb3e3af5e308accc924d3501213841d7d6c7 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -76,9 +76,13 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) { TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) { HloComputation::Builder builder(TestName()); auto root_shape = ShapeUtil::MakeShape(F32, {8, 7}); - auto rng0 = builder.AddInstruction( - HloInstruction::CreateRng(ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}), - RandomDistribution::RNG_UNIFORM, {})); + auto rng0 = builder.AddInstruction(HloInstruction::CreateRng( + ShapeUtil::MakeShape(F32, {1, 8, 1, 7, 1}), + RandomDistribution::RNG_UNIFORM, + {builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), + builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(1.0f)))})); auto reshape0 = builder.AddInstruction(HloInstruction::CreateReshape(root_shape, rng0)); diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index ce070bc5b6c3dfc22ffd0922be27f0afd6bff48f..1dbf540d13d1fb6f6a4052caeff922cc0290f1b8 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -53,10 +53,10 @@ limitations under the License. #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/ptr_util.h" using ::tensorflow::strings::Printf; using ::tensorflow::strings::StrCat; -using ::xla::source_map_util::InvalidParameterArgument; namespace xla { @@ -409,7 +409,7 @@ Service::ExecuteParallelAndRegisterResult( streams.push_back(std::move(stream)); if (replica == 0 && profile != nullptr) { - timers.emplace_back(new se::Timer(streams.back()->parent())); + timers.push_back(MakeUnique(streams.back()->parent())); streams.back() ->InitTimer(timers.back().get()) .ThenStartTimer(timers.back().get()); @@ -441,7 +441,7 @@ Service::ExecuteParallelAndRegisterResult( streams.back()->ThenStopTimer(timers.back().get()); } - result_buffers.emplace_back(std::move(result)); + result_buffers.push_back(std::move(result)); } TF_ASSIGN_OR_RETURN(GlobalDataHandle handle, allocation_tracker_.RegisterReplicatedBuffers( @@ -559,7 +559,7 @@ StatusOr Service::ExecuteAndRegisterResult( std::vector> replicated_arguments; for (const auto& arg : arguments) { - replicated_arguments.emplace_back(arg); + replicated_arguments.push_back(arg); } TF_ASSIGN_OR_RETURN(auto results, executable->ExecuteOnStreams( @@ -1053,11 +1053,12 @@ Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg, executor = replicas[arg->replica_id()]; } - Literal literal; + auto literal = Literal::CreateFromShape(arg->shape_with_layout()); + TF_RETURN_IF_ERROR( execute_backend_->transfer_manager()->TransferLiteralFromOutfeed( - executor, arg->shape_with_layout(), &literal)); - *result->mutable_literal() = literal.ToProto(); + executor, arg->shape_with_layout(), *literal)); + *result->mutable_literal() = literal->ToProto(); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 35df792b07022b2338fcecc25eb8a0718626e464..a4ea2b28f4dbf41d61702f1af2d65c4d2c86d578 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -58,66 +58,101 @@ Status ExpectArray(const Shape& shape, tensorflow::StringPiece op_type) { return Status::OK(); } -Status VerifyReducerShape(const ProgramShape& reducer_shape, - const Shape& init_value_shape, - const PrimitiveType& input_element_type) { - if (reducer_shape.parameters_size() != 2) { - return InvalidArgument( - "Reduction function must take 2 parameters, but " +Status VerifyReducerShape( + const ProgramShape& reducer_shape, + tensorflow::gtl::ArraySlice init_value_shapes, + tensorflow::gtl::ArraySlice input_element_types, + int64 inputs) { + if (reducer_shape.parameters_size() != inputs * 2) { + return InvalidArgument( + "Reduction function must take %lld parameters, but " "takes %d parameter(s).", - reducer_shape.parameters_size()); + inputs * 2, reducer_shape.parameters_size()); } const Shape& accumulator_shape = reducer_shape.result(); - if (!ShapeUtil::IsArray(accumulator_shape) || - ShapeUtil::Rank(accumulator_shape) != 0) { - return InvalidArgument( - "Reduction function must produce a scalar but has shape: %s", - ShapeUtil::HumanString(accumulator_shape).c_str()); - } - - // Check that the accumulator can be passed in as the first argument. - // Note: comparing here and below with Compatible since we don't care about - // layout in scalars - see b/26668201 for a longer-term vision. - if (!ShapeUtil::Compatible(accumulator_shape, reducer_shape.parameters(0))) { + std::vector accumulator_subshapes; + if (ShapeUtil::IsArray(accumulator_shape)) { + if (inputs != 1) { + return InvalidArgument( + "Reduction function must produce a tuple with %lld elements, but " + "produces a scalar", + inputs); + } + accumulator_subshapes.push_back(&accumulator_shape); + } else if (ShapeUtil::IsTuple(accumulator_shape)) { + if (ShapeUtil::TupleElementCount(accumulator_shape) != inputs) { + return InvalidArgument( + "Reduction function must produce a tuple with %lld elements, but has " + "%lld elements", + inputs, ShapeUtil::TupleElementCount(accumulator_shape)); + } + for (const Shape& element_shape : accumulator_shape.tuple_shapes()) { + accumulator_subshapes.push_back(&element_shape); + } + } else { return InvalidArgument( - "Reduction function's first parameter shape differs from the " - "result shape: %s vs %s", - ShapeUtil::HumanString(reducer_shape.parameters(0)).c_str(), + "Reduction function must produce a scalar or tuple of scalars, but has " + "shape: %s", ShapeUtil::HumanString(accumulator_shape).c_str()); } - // Check that init_value's shape is suitable for reducer_shape. - if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape, - init_value_shape)) { - return InvalidArgument( - "Reduction function's accumulator shape differs from the " - "init_value shape: %s vs %s", - ShapeUtil::HumanString(accumulator_shape).c_str(), - ShapeUtil::HumanString(init_value_shape).c_str()); - } - - // Check that the inputs can be passed in as the second argument. - const Shape& input_element_shape = - ShapeUtil::MakeShape(input_element_type, {}); - if (!ShapeUtil::CompatibleIgnoringFpPrecision(input_element_shape, - reducer_shape.parameters(1))) { - return InvalidArgument( - "Reduction function's second parameter shape differs from the " - "input type element type: %s vs %s", - ShapeUtil::HumanString(reducer_shape.parameters(1)).c_str(), - ShapeUtil::HumanString(input_element_shape).c_str()); + for (const Shape* element_shape : accumulator_subshapes) { + if (ShapeUtil::Rank(*element_shape) != 0) { + return InvalidArgument( + "Reduction function must return a scalar or tuple of scalars but " + "returns shape: %s", + ShapeUtil::HumanString(accumulator_shape).c_str()); + } } - // Currently the accumulator and inputs must be the same type, - // though that restriction could be relaxed. - if (!ShapeUtil::CompatibleIgnoringFpPrecision(accumulator_shape, - reducer_shape.parameters(1))) { - return InvalidArgument( - "Reduction function's second parameter shape must " - "match the result shape, but got %s vs %s.", - ShapeUtil::HumanString(reducer_shape.parameters(1)).c_str(), - ShapeUtil::HumanString(accumulator_shape).c_str()); + for (int64 i = 0; i < inputs; ++i) { + // Check that the accumulator can be passed in as the first argument. + // Note: comparing here and below with Compatible since we don't care about + // layout in scalars - see b/26668201 for a longer-term vision. + if (!ShapeUtil::Compatible(*accumulator_subshapes[i], + reducer_shape.parameters(i))) { + return InvalidArgument( + "Reduction function's %lld-th parameter shape differs from the " + "result shape: %s vs %s", + i, ShapeUtil::HumanString(reducer_shape.parameters(i)).c_str(), + ShapeUtil::HumanString(*accumulator_subshapes[i]).c_str()); + } + // Check that init_value's shapes are suitable for reducer_shape. + if (!ShapeUtil::CompatibleIgnoringFpPrecision(*accumulator_subshapes[i], + *init_value_shapes[i])) { + return InvalidArgument( + "Reduction function's accumulator shape at index %lld differs from " + "the init_value shape: %s vs %s", + i, ShapeUtil::HumanString(*accumulator_subshapes[i]).c_str(), + ShapeUtil::HumanString(*init_value_shapes[i]).c_str()); + } + // Check that the inputs can be passed in as the non-accumulator arguments. + const Shape input_element_shape = + ShapeUtil::MakeShape(input_element_types[i], {}); + if (!ShapeUtil::CompatibleIgnoringFpPrecision( + input_element_shape, reducer_shape.parameters(inputs + i))) { + return InvalidArgument( + "Reduction function's %lld-th parameter shape differs from the " + "input type element type: %s vs %s", + inputs + i, + ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)).c_str(), + ShapeUtil::HumanString(input_element_shape).c_str()); + } + // Check that the accumulator and inputs to the reducer function match. + // If the accumulator is scalar, it must have the same type as the inputs + // (up to fp precision). If it is a tuple, then the k-th element of the + // tuple must have the same type as the K-th input (again, up to fp + // precision.) + if (!ShapeUtil::CompatibleIgnoringFpPrecision( + *accumulator_subshapes[i], reducer_shape.parameters(inputs + i))) { + return InvalidArgument( + "Reduction function's %lld-th parameter shape must " + "match the result shape, but got %s vs %s.", + inputs + i, + ShapeUtil::HumanString(reducer_shape.parameters(inputs + i)).c_str(), + ShapeUtil::HumanString(*accumulator_subshapes[i]).c_str()); + } } return Status::OK(); @@ -1744,11 +1779,83 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return ShapeUtil::MakeTupleShape(operand_shape_values); } +/* static */ StatusOr ShapeInference::InferAllToAllShape( + const Shape& shape, int64 split_dimension, int64 concat_dimension, + int64 split_count) { + TF_RET_CHECK(split_count > 0); + if (split_dimension >= ShapeUtil::Rank(shape) || split_dimension < 0) { + return InvalidArgument( + "AllToAll split_dimension %lld is out-of-bounds in shape %s.", + split_dimension, ShapeUtil::HumanString(shape).c_str()); + } + if (concat_dimension >= ShapeUtil::Rank(shape) || concat_dimension < 0) { + return InvalidArgument( + "AllToAll concat_dimension %lld is out-of-bounds in shape %s.", + concat_dimension, ShapeUtil::HumanString(shape).c_str()); + } + if (shape.dimensions(split_dimension) % split_count != 0) { + return InvalidArgument( + "AllToAll split dimension size %lld must be dividable by split_count " + "%lld.", + shape.dimensions(split_dimension), split_count); + } + std::vector new_dimensions(shape.dimensions().begin(), + shape.dimensions().end()); + new_dimensions[split_dimension] /= split_count; + new_dimensions[concat_dimension] *= split_count; + return ShapeUtil::MakeShape(shape.element_type(), new_dimensions); +} + +/* static */ StatusOr ShapeInference::InferAllToAllTupleShape( + tensorflow::gtl::ArraySlice operand_shapes) { + // An Alltoall HLO instruction receives N operands (with the same shape) and + // returns a tuple that contains N array shapes. + TF_RET_CHECK(!operand_shapes.empty()); + for (int i = 0; i < operand_shapes.size(); i++) { + if (!ShapeUtil::Equal(*operand_shapes[0], *operand_shapes[i])) { + return InvalidArgument( + "HLO all-to-all has operands with different shapes: the 0th " + "operand shape %s, but the %dth operand has shape %s.", + ShapeUtil::HumanString(*operand_shapes[0]).c_str(), i, + ShapeUtil::HumanString(*operand_shapes[i]).c_str()); + } + } + + return InferVariadicOpShape(HloOpcode::kTuple, operand_shapes); +} + /* static */ StatusOr ShapeInference::InferReduceShape( - const Shape& arg, const Shape& init_value, + tensorflow::gtl::ArraySlice arg_shapes, tensorflow::gtl::ArraySlice dimensions_to_reduce, const ProgramShape& to_apply) { - // Check that the dimension to reduce are in-bounds for the given shape. + if (arg_shapes.empty()) { + return InvalidArgument("Reduce must have at least 2 arguments, has 0"); + } + if (arg_shapes.size() % 2) { + return InvalidArgument( + "Reduce must have an even number of arguments, has %lu", + arg_shapes.size()); + } + int64 num_reduced_args = arg_shapes.size() / 2; + + tensorflow::gtl::ArraySlice reduced_args(arg_shapes, 0, + num_reduced_args); + // Check that all of the reduced tensors have the same dimensions. The element + // types may be different. + for (int64 i = 1; i < num_reduced_args; ++i) { + if (!ShapeUtil::SameDimensions(*reduced_args[0], *reduced_args[i])) { + return InvalidArgument( + "All reduced tensors must have the sime dimension. Tensor 0 has " + "shape %s, Tensor %lld has shape %s", + ShapeUtil::HumanString(*reduced_args[0]).c_str(), i, + ShapeUtil::HumanString(*reduced_args[i]).c_str()); + } + } + + // Check that the dimensions to reduce are in-bounds for the given shape. + // We've already verified all reduced tensors have the same dimensions, so it + // doesn't matter which one we choose. + const Shape& arg = *reduced_args[0]; for (int64 dimension : dimensions_to_reduce) { if (dimension >= ShapeUtil::Rank(arg) || dimension < 0) { return InvalidArgument( @@ -1756,8 +1863,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, ShapeUtil::HumanString(arg).c_str()); } } - TF_RETURN_IF_ERROR( - VerifyReducerShape(to_apply, init_value, arg.element_type())); + + tensorflow::gtl::ArraySlice init_values( + arg_shapes, num_reduced_args, arg_shapes.size()); + std::vector element_types; + for (const Shape* arg : reduced_args) { + element_types.push_back(arg->element_type()); + } + TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply, init_values, element_types, + num_reduced_args)); std::set dimensions_to_reduce_set(dimensions_to_reduce.begin(), dimensions_to_reduce.end()); @@ -1768,15 +1882,26 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } } - return ShapeUtil::MakeShape(to_apply.result().element_type(), new_dimensions); + if (ShapeUtil::IsScalar(to_apply.result())) { + return ShapeUtil::MakeShape(to_apply.result().element_type(), + new_dimensions); + } else { + std::vector result_subshapes; + for (const Shape& subshape : to_apply.result().tuple_shapes()) { + result_subshapes.push_back( + ShapeUtil::MakeShape(subshape.element_type(), new_dimensions)); + } + return ShapeUtil::MakeTupleShape(result_subshapes); + } } /* static */ StatusOr ShapeInference::InferReduceWindowShape( const Shape& operand_shape, const Shape& init_value_shape, const Window& window, const ProgramShape& to_apply_shape) { TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of reduce-window")); - TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, init_value_shape, - operand_shape.element_type())); + TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, {&init_value_shape}, + {operand_shape.element_type()}, + /*inputs=*/1)); return InferWindowOutputShape(operand_shape, window, init_value_shape.element_type(), /*allow_negative_padding=*/false); @@ -1821,8 +1946,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, } // Check if the scatter function has a proper shape as a reduction. - TF_RETURN_IF_ERROR(VerifyReducerShape(scatter_shape, init_value_shape, - source_shape.element_type())); + TF_RETURN_IF_ERROR(VerifyReducerShape(scatter_shape, {&init_value_shape}, + {source_shape.element_type()}, + /*inputs=*/1)); // Check if the result shape of window operation matches the source shape. TF_ASSIGN_OR_RETURN(const Shape& window_result_shape, @@ -2568,4 +2694,194 @@ static Status ValidateGatherDimensionNumbers( return ShapeUtil::MakeShape(input_shape.element_type(), output_dim_bounds); } +namespace { + +Status ValidateScatterDimensionNumbers( + const Shape& operand_shape, + tensorflow::gtl::ArraySlice scatter_indices_shape, + const Shape& updates_shape, const ScatterDimensionNumbers& dim_numbers) { + // Validate update_window_dims in ScatterDimensionNumbers. + if (!c_is_sorted(dim_numbers.update_window_dims())) { + return InvalidArgument( + "update_window_dims in scatter op must be sorted; got: %s.", + Join(dim_numbers.update_window_dims(), ", ").c_str()); + } + if (c_adjacent_find(dim_numbers.update_window_dims()) != + dim_numbers.update_window_dims().end()) { + return InvalidArgument( + "update_window_dims in scatter op must not repeat; got: %s.", + Join(dim_numbers.update_window_dims(), ", ").c_str()); + } + const int64 updates_rank = ShapeUtil::Rank(updates_shape); + for (int64 window_dim : dim_numbers.update_window_dims()) { + if (window_dim < 0 || window_dim >= updates_rank) { + return InvalidArgument( + "Invalid update_window_dims set in scatter op; valid range is [0, " + "%lld). got: %lld.", + updates_rank, window_dim); + } + } + + // Validate inserted_window_dims in ScatterDimensionNumbers. + if (!c_is_sorted(dim_numbers.inserted_window_dims())) { + return InvalidArgument( + "inserted_window_dims in scatter op must be sorted; got: %s.", + Join(dim_numbers.inserted_window_dims(), ", ").c_str()); + } + if (c_adjacent_find(dim_numbers.inserted_window_dims()) != + dim_numbers.inserted_window_dims().end()) { + return InvalidArgument( + "inserted_window_dims in scatter op must not repeat; got: %s.", + Join(dim_numbers.inserted_window_dims(), ", ").c_str()); + } + for (int64 inserted_dim : dim_numbers.inserted_window_dims()) { + if (inserted_dim < 0 || inserted_dim >= operand_shape.dimensions_size()) { + return InvalidArgument( + "Invalid inserted_window_dims set in scatter op; valid range is [0, " + "%d), got: %lld.", + operand_shape.dimensions_size(), inserted_dim); + } + } + + // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers. + if (dim_numbers.scatter_dims_to_operand_dims_size() != + scatter_indices_shape[dim_numbers.index_vector_dim()]) { + return InvalidArgument( + "Scatter op has %d elements in scatter_dims_to_operand_dims and the " + "bound of dimension index_vector_dim=%lld of scatter_indices is %lld. " + "These two numbers must be equal.", + dim_numbers.scatter_dims_to_operand_dims_size(), + dim_numbers.index_vector_dim(), + scatter_indices_shape[dim_numbers.index_vector_dim()]); + } + for (int i = 0; i < dim_numbers.scatter_dims_to_operand_dims_size(); ++i) { + int64 scatter_dim_to_operand_dim = + dim_numbers.scatter_dims_to_operand_dims(i); + if (scatter_dim_to_operand_dim < 0 || + scatter_dim_to_operand_dim >= operand_shape.dimensions_size()) { + return InvalidArgument( + "Invalid scatter_dims_to_operand_dims mapping; domain is [0, %d), " + "got: %d->%lld.", + operand_shape.dimensions_size(), i, scatter_dim_to_operand_dim); + } + } + std::vector sorted_scatter_dims_to_operand_dims( + dim_numbers.scatter_dims_to_operand_dims().begin(), + dim_numbers.scatter_dims_to_operand_dims().end()); + c_sort(sorted_scatter_dims_to_operand_dims); + if (c_adjacent_find(sorted_scatter_dims_to_operand_dims) != + sorted_scatter_dims_to_operand_dims.end()) { + return InvalidArgument( + "Repeated dimensions not allowed in scatter_dims_to_operand_dims; " + "got: %s.", + Join(dim_numbers.scatter_dims_to_operand_dims(), ", ").c_str()); + } + + return Status::OK(); +} + +} // namespace + +/*static*/ StatusOr ShapeInference::InferScatterShape( + const Shape& operand_shape, const Shape& scatter_indices_shape, + const Shape& updates_shape, const ProgramShape& to_apply_shape, + const ScatterDimensionNumbers& scatter_dim_numbers) { + TF_RETURN_IF_ERROR( + ExpectArray(operand_shape, "operand tensor of scatter op")); + TF_RETURN_IF_ERROR( + ExpectArray(scatter_indices_shape, "scatter indices of scatter op")); + TF_RETURN_IF_ERROR(ExpectArray(updates_shape, "updates of scatter op")); + + if (!ShapeUtil::ElementIsIntegral(scatter_indices_shape)) { + return InvalidArgument( + "Scatter indices parameter must be an integral tensor; got %s.", + ShapeUtil::HumanString(scatter_indices_shape).c_str()); + } + + if (scatter_indices_shape.dimensions_size() < + scatter_dim_numbers.index_vector_dim() || + scatter_dim_numbers.index_vector_dim() < 0) { + return InvalidArgument( + "Scatter index leaf dimension must be within [0, rank(scatter_indices)" + " + 1). rank(scatter_indices) is %d and scatter index leaf dimension " + "is %lld.", + scatter_indices_shape.dimensions_size(), + scatter_dim_numbers.index_vector_dim()); + } + + // Check if the update computation has a proper shape as a reduction. + const Shape init_value_shape = + ShapeUtil::MakeShape(operand_shape.element_type(), {}); + TF_RETURN_IF_ERROR(VerifyReducerShape(to_apply_shape, {&init_value_shape}, + {updates_shape.element_type()}, + /*inputs=*/1)); + + std::vector expanded_scatter_indices_shape = + ArraySliceToVector(AsInt64Slice(scatter_indices_shape.dimensions())); + if (expanded_scatter_indices_shape.size() == + scatter_dim_numbers.index_vector_dim()) { + expanded_scatter_indices_shape.push_back(1); + } + + int64 expected_updates_rank = expanded_scatter_indices_shape.size() - 1 + + scatter_dim_numbers.update_window_dims_size(); + if (ShapeUtil::Rank(updates_shape) != expected_updates_rank) { + return InvalidArgument("Updates tensor must be of rank %lld; got %lld.", + expected_updates_rank, + ShapeUtil::Rank(updates_shape)); + } + + TF_RETURN_IF_ERROR(ValidateScatterDimensionNumbers( + operand_shape, expanded_scatter_indices_shape, updates_shape, + scatter_dim_numbers)); + + int64 inserted_dims_seen = 0; + std::vector max_update_window_bounds; + for (int i = 0; i < operand_shape.dimensions_size(); ++i) { + if (inserted_dims_seen < scatter_dim_numbers.inserted_window_dims_size() && + scatter_dim_numbers.inserted_window_dims(inserted_dims_seen) == i) { + ++inserted_dims_seen; + } else { + max_update_window_bounds.push_back(operand_shape.dimensions(i)); + } + } + for (int i = 0; i < scatter_dim_numbers.update_window_dims_size(); ++i) { + auto update_window_dim = scatter_dim_numbers.update_window_dims(i); + if (updates_shape.dimensions(update_window_dim) > + max_update_window_bounds[i]) { + return InvalidArgument( + "Bounds of the window dimensions of updates must not exceed the " + "bounds of the corresponding dimensions of operand. For dimension " + "%lld, updates bound is %lld, operand bound is %lld.", + update_window_dim, updates_shape.dimensions(update_window_dim), + max_update_window_bounds[i]); + } + } + + int64 scatter_dims_seen = 0; + for (int64 i = 0; i < ShapeUtil::Rank(updates_shape); ++i) { + bool is_update_window_dim = + c_binary_search(scatter_dim_numbers.update_window_dims(), i); + if (is_update_window_dim) { + continue; + } + if (scatter_dims_seen == scatter_dim_numbers.index_vector_dim()) { + ++scatter_dims_seen; + } + if (updates_shape.dimensions(i) != + expanded_scatter_indices_shape[scatter_dims_seen]) { + return InvalidArgument( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices. For " + "scatter dimension %lld, updates bound is %lld, scatter_indices " + "bound is %lld.", + i, updates_shape.dimensions(i), + expanded_scatter_indices_shape[scatter_dims_seen]); + } + ++scatter_dims_seen; + } + + return operand_shape; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 1a5684e3c306eef90fd1bfdf4565b0dcde2fbab6..c185b0a1bd79e23e0d76daad50fb4a9708a743dd 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -119,11 +119,22 @@ class ShapeInference { const Shape& in, FftType fft_type, tensorflow::gtl::ArraySlice fft_length); - // Infers the shape produced a cross replica sum with the given operand + // Infers the shape produced by a cross replica sum with the given operand // shapes. static StatusOr InferCrossReplicaSumShape( tensorflow::gtl::ArraySlice operand_shapes); + // Infers final shape of an Alltoall operation that is created by the xla + // builder. + static StatusOr InferAllToAllShape(const Shape& shape, + int64 split_dimension, + int64 concat_dimension, + int64 split_count); + + // Infers the shape of an HLO all-to-all instruction. + static StatusOr InferAllToAllTupleShape( + tensorflow::gtl::ArraySlice operand_shapes); + // Infers the shape produced by applying the given reduction computation // shape to the given input operand shape. // @@ -131,7 +142,7 @@ class ShapeInference { // index as the leading parameter, and the program shape should match // accordingly (or an error will result). static StatusOr InferReduceShape( - const Shape& arg, const Shape& init_value, + tensorflow::gtl::ArraySlice arg_shapes, tensorflow::gtl::ArraySlice dimensions_to_reduce, const ProgramShape& to_apply); @@ -268,6 +279,14 @@ class ShapeInference { const GatherDimensionNumbers& gather_dim_numbers, tensorflow::gtl::ArraySlice window_bounds); + // Helper that validates the given input shape, scatter indices shape, updates + // shape, and scatter dimension numbers that constitute a scatter operation, + // and returns the result shape of the scatter operation. + static StatusOr InferScatterShape( + const Shape& operand_shape, const Shape& scatter_indices_shape, + const Shape& updates_shape, const ProgramShape& to_apply_shape, + const ScatterDimensionNumbers& scatter_dim_numbers); + private: // Helper that infers the shape produced by performing an element-wise binary // operation with the given LHS and RHS shapes. diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 6046d50c6d41a3956b996a3320848784ffd59068..a73fa181cdd13dc7fabcdc367ae117e19bdc3e5f 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -63,7 +63,7 @@ class ReduceShapeInferenceTest : public ShapeInferenceTest { tensorflow::gtl::ArraySlice dimensions_to_reduce) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_); auto inferred_status = ShapeInference::InferReduceShape( - arg, f32_, dimensions_to_reduce, to_apply); + {&arg, &f32_}, dimensions_to_reduce, to_apply); EXPECT_IS_OK(inferred_status.status()); EXPECT_TRUE(ShapeUtil::Equal(expected_inferred_shape, inferred_status.ValueOrDie())); @@ -703,11 +703,99 @@ TEST_F(ReduceShapeInferenceTest, ReduceCubeAmongAllDimensions) { /*dimensions_to_reduce=*/{0, 1, 2}); } +TEST_F(ReduceShapeInferenceTest, ReduceMultiOutput) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {f32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_IS_OK(inferred_status.status()); + EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeTupleShape({f32_, s32_}), + inferred_status.ValueOrDie())); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput1) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = + ShapeUtil::MakeProgramShape({f32_, s32_, f32_, s32_, f32_, s32_}, + ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("must take 4 parameters, but takes 6 parameter(s)")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput2) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {s32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT( + inferred_status.status().error_message(), + HasSubstr( + "parameter shape differs from the result shape: s32[] vs f32[]")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput3) { + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {s32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape({}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("must have at least 2 arguments, has 0")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerOutput1) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = + ShapeUtil::MakeProgramShape({f32_, s32_, f32_, s32_}, f32_); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT( + inferred_status.status().error_message(), + HasSubstr("must produce a tuple with 2 elements, but produces a scalar")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerOutput2) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {f32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT( + inferred_status.status().error_message(), + HasSubstr("must produce a tuple with 2 elements, but has 3 elements")); +} + +TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerBoth) { + Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); + Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3}); + ProgramShape to_apply = ShapeUtil::MakeProgramShape( + {s32_, s32_, s32_, s32_}, ShapeUtil::MakeTupleShape({s32_, s32_})); + auto inferred_status = ShapeInference::InferReduceShape( + {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply); + EXPECT_FALSE(inferred_status.ok()); + EXPECT_THAT(inferred_status.status().error_message(), + HasSubstr("accumulator shape at index 0 differs from the " + "init_value shape: s32[] vs f32[]")); +} + TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_); + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); auto inferred_status = ShapeInference::InferReduceShape( - ShapeUtil::MakeShape(F32, {5, 3}), f32_, /*dimensions_to_reduce=*/{3, 4}, - to_apply); + {&arg_shape, &f32_}, + /*dimensions_to_reduce=*/{3, 4}, to_apply); EXPECT_FALSE(inferred_status.ok()); EXPECT_THAT(inferred_status.status().error_message(), HasSubstr("out-of-bounds dimension")); @@ -715,8 +803,9 @@ TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) { TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_, f32_}, f32_); + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); auto inferred_status = - ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_, + ShapeInference::InferReduceShape({&arg_shape, &f32_}, /*dimensions_to_reduce=*/{0}, to_apply); EXPECT_FALSE(inferred_status.ok()); EXPECT_THAT(inferred_status.status().error_message(), @@ -725,12 +814,13 @@ TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) { TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) { ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, s32_); + Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3}); auto inferred_status = - ShapeInference::InferReduceShape(ShapeUtil::MakeShape(F32, {5, 3}), f32_, + ShapeInference::InferReduceShape({&arg_shape, &f32_}, /*dimensions_to_reduce=*/{0}, to_apply); EXPECT_FALSE(inferred_status.ok()); EXPECT_THAT(inferred_status.status().error_message(), - HasSubstr("first parameter shape differs")); + HasSubstr("0-th parameter shape differs")); } TEST_F(ShapeInferenceTest, InferSliceShapeRank2) { @@ -1536,7 +1626,7 @@ TEST_F(ShapeInferenceTest, BadSort) { << statusor.status(); } -class GatherShapeInferenceTest : public ShapeInferenceTest { +class ScatterGatherShapeInferenceTest : public ShapeInferenceTest { protected: const Shape s64_scalar_ = ShapeUtil::MakeShape(S64, {}); const Shape s64_vector_5_ = ShapeUtil::MakeShape(S64, {5}); @@ -1553,9 +1643,13 @@ class GatherShapeInferenceTest : public ShapeInferenceTest { ShapeUtil::MakeShape(F32, {50, 49, 48, 47, 46}); const Shape tuple_shape_ = ShapeUtil::MakeTupleShape( {s64_4d_tensor_10_9_8_7_1_, s64_4d_tensor_10_9_8_7_1_}); + const ProgramShape to_apply_ = + ShapeUtil::MakeProgramShape({f32_, f32_}, f32_); }; -TEST_F(GatherShapeInferenceTest, TensorFlowGather) { +// Shape inference tests for Gather. + +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGather) { TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, ShapeInference::InferGatherShape( matrix_64_48_, s64_vector_32_, @@ -1570,7 +1664,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGather) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TensorFlowGatherV2) { +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGatherV2) { TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, ShapeInference::InferGatherShape( matrix_64_48_, s64_vector_32_, @@ -1585,7 +1679,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGatherV2) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TensorFlowGatherNd) { +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowGatherNd) { TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, ShapeInference::InferGatherShape( matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, @@ -1600,7 +1694,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGatherNd) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) { +TEST_F(ScatterGatherShapeInferenceTest, TensorFlowBatchDynamicSlice) { TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, ShapeInference::InferGatherShape( @@ -1617,7 +1711,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) { +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) { TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, ShapeInference::InferGatherShape( @@ -1635,7 +1729,7 @@ TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) { +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) { TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, ShapeInference::InferGatherShape( @@ -1653,7 +1747,7 @@ TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, NoOutputGatherDims) { +TEST_F(ScatterGatherShapeInferenceTest, NoOutputGatherDims) { // This is equivalent to a dynamic slice. TF_ASSERT_OK_AND_ASSIGN( Shape gather_shape, @@ -1671,7 +1765,7 @@ TEST_F(GatherShapeInferenceTest, NoOutputGatherDims) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, ScalarGatherIndices) { +TEST_F(ScatterGatherShapeInferenceTest, ScalarGatherIndices) { // The gather indices "tensor" is a scalar S here that's used to slice out // [S,0,0,0,0]..[S,30,29,28,27] into a [30,29,28,27] shaped result. TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape, @@ -1689,7 +1783,7 @@ TEST_F(GatherShapeInferenceTest, ScalarGatherIndices) { << ShapeUtil::HumanString(gather_shape); } -TEST_F(GatherShapeInferenceTest, TupleShapedTensorInput) { +TEST_F(ScatterGatherShapeInferenceTest, TupleShapedTensorInput) { StatusOr statusor = ShapeInference::InferGatherShape( tuple_shape_, s64_vector_32_, HloGatherInstruction::MakeGatherDimNumbers( @@ -1704,7 +1798,7 @@ TEST_F(GatherShapeInferenceTest, TupleShapedTensorInput) { << statusor.status(); } -TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) { +TEST_F(ScatterGatherShapeInferenceTest, TupleShapedGatherIndicesInput) { StatusOr statusor = ShapeInference::InferGatherShape( s64_vector_32_, tuple_shape_, HloGatherInstruction::MakeGatherDimNumbers( @@ -1719,7 +1813,7 @@ TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) { << statusor.status(); } -TEST_F(GatherShapeInferenceTest, FloatingPointGatherIndicesInput) { +TEST_F(ScatterGatherShapeInferenceTest, FloatingPointGatherIndicesInput) { StatusOr statusor = ShapeInference::InferGatherShape( s64_vector_32_, vector_32_, HloGatherInstruction::MakeGatherDimNumbers( @@ -1734,7 +1828,7 @@ TEST_F(GatherShapeInferenceTest, FloatingPointGatherIndicesInput) { << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_NonAscendingWindowIndices) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1751,7 +1845,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_RepeatedWindowIndices) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1768,7 +1862,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_WindowIndexOutOfBounds) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1784,7 +1878,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_WindowIndexBarelyOutOfBounds) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1800,7 +1894,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_MismatchingElidedWindowDims) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1818,7 +1912,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_OutOfBoundsWindowToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1835,7 +1929,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_RepeatedWindowToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1853,7 +1947,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_MismatchingGatherToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1872,7 +1966,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_OutOfBoundsGatherToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1890,7 +1984,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_RepeatedGatherToInputMapping) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1908,7 +2002,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_NonAscendingElidedWindowDims) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1924,7 +2018,8 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsTooLarge) { +TEST_F(ScatterGatherShapeInferenceTest, + InvalidGatherDimNumbers_WindowBoundsTooLarge) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, HloGatherInstruction::MakeGatherDimNumbers( @@ -1940,7 +2035,7 @@ TEST_F(GatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsTooLarge) { << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_MismatchingNumberOfWindowBounds) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1958,7 +2053,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, +TEST_F(ScatterGatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsNot1ForElidedDim) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, @@ -1975,7 +2070,7 @@ TEST_F(GatherShapeInferenceTest, << statusor.status(); } -TEST_F(GatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) { +TEST_F(ScatterGatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) { StatusOr statusor = ShapeInference::InferGatherShape( f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_, HloGatherInstruction::MakeGatherDimNumbers( @@ -1992,5 +2087,575 @@ TEST_F(GatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) { << statusor.status(); } +// Shape inference tests for Scatter. + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithFullUpdates) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {64, 32}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithFullUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {32, 48}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithPartialUpdates) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {10, 32}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithPartialUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, + ShapeUtil::MakeShape(F32, {32, 8}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/1))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithUpdatesBiggerThanInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {65, 32}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Bounds of the window dimensions of updates must not exceed " + "the bounds of the corresponding dimensions of operand.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterWithUpdatesBiggerThanInputV2) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {32, 49}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Bounds of the window dimensions of updates must not exceed " + "the bounds of the corresponding dimensions of operand.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + TfScatterWithUpdatesNotMatchingIndices) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {64, 31}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + TfScatterWithUpdatesNotMatchingIndicesV2) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_vector_32_, ShapeUtil::MakeShape(F32, {31, 48}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{1}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithFullUpdates) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 48}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithFullUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 64}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithPartialUpdates) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 10}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithPartialUpdatesV2) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 12}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, matrix_64_48_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfScatterNdWithUpdatesBiggerThanInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 65}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Bounds of the window dimensions of updates must not exceed " + "the bounds of the corresponding dimensions of operand.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + TfScatterNdWithUpdatesNotMatchingIndices) { + StatusOr statusor = ShapeInference::InferScatterShape( + matrix_64_48_, s64_4d_tensor_10_9_8_7_1_, + ShapeUtil::MakeShape(F32, {9, 9, 8, 7, 64}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Bounds of the scatter dimensions of updates must be same as the " + "bounds of the corresponding dimensions of scatter indices.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, TfBatchDynamicUpdateSlice) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4))); + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultScatterIndicesLeafDim) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_, + ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/2))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, NonDefaultScatterIndicesLeafDimV2) { + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_, + ShapeUtil::MakeShape(F32, {10, 9, 7, 6, 30, 29, 28, 27, 26}), + to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 8}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/0))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, NoUpdateScatterDims) { + // This is equivalent to a dynamic update slice. + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_vector_5_, + ShapeUtil::MakeShape(F32, {30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0, 1, 2, 3, 4}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/0))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, ScalarScatterIndices) { + // The scalar indices "tensor" is a scalar S here that's used to update a + // [30,29,28,27] shaped tensor within the operand at position S. + TF_ASSERT_OK_AND_ASSIGN( + Shape scatter_shape, + ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_scalar_, + ShapeUtil::MakeShape(F32, {30, 29, 28, 27}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0, 1, 2, 3}, + /*inserted_window_dims=*/{0}, + /*scatter_dims_to_operand_dims=*/{0}, + /*index_vector_dim=*/0))); + + EXPECT_TRUE(ShapeUtil::Equal(scatter_shape, f32_5d_tensor_50_49_48_47_46_)) + << ShapeUtil::HumanString(scatter_shape); +} + +TEST_F(ScatterGatherShapeInferenceTest, ScatterWithTupleShapedTensorInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + tuple_shape_, s64_vector_32_, s64_vector_32_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/1)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Expected array argument for operand")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + ScatterWithTupleShapedScatterIndicesInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + s64_vector_32_, tuple_shape_, s64_vector_32_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Expected array argument for scatter indices")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, ScatterWithTupleShapedUpdatesInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + s64_vector_32_, s64_vector_32_, tuple_shape_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Expected array argument for updates")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, FloatingPointScatterIndicesInput) { + StatusOr statusor = ShapeInference::InferScatterShape( + s64_vector_32_, vector_32_, s64_vector_32_, to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{0}, + /*inserted_window_dims=*/{1}, + /*scatter_dims_to_operand_dims=*/{1}, + /*index_vector_dim=*/0)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Scatter indices parameter must be an integral tensor")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, OutOfBoundsScatterIndicesLeafDim) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/10)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Scatter index leaf dimension must be within [0, " + "rank(scatter_indices) + 1)")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, InvalidUpdates) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 50}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Updates tensor must be of rank 7; got 8.")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, InvalidUpdateComputation) { + const ProgramShape invalid_update_computation = + ShapeUtil::MakeProgramShape({f32_}, f32_); + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), + invalid_update_computation, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Reduction function must take 2 parameters, but takes 1")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_NonAscendingUpdateWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 8, 7}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("update_window_dims in scatter op must be sorted")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_RepeatedUpdateWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 7}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("update_window_dims in scatter op must not repeat")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_OutOfBoundsUpdateWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28, 27, 26}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6, 7, 9}, + /*inserted_window_dims=*/{}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid update_window_dims set in scatter op; valid " + "range is [0, 9)")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_NonAscendingInsertedWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{2, 1}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("inserted_window_dims in scatter op must be sorted")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_RepeatedInsertedWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 1}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("inserted_window_dims in scatter op must not repeat")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_OutOfBoundsInsertedWindowDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 5}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 4}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid inserted_window_dims set in scatter op; valid " + "range is [0, 5)")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_MismatchingScatterDimsToOperandDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr("Scatter op has 4 elements in scatter_dims_to_operand_dims and " + "the bound of dimension index_vector_dim=4 of scatter_indices " + "is 5. These two numbers must be equal")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_OutOfBoundsScatterDimsToOperandDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 3, 10}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT(statusor.status().error_message(), + HasSubstr("Invalid scatter_dims_to_operand_dims mapping; domain " + "is [0, 5), got: 4->10")) + << statusor.status(); +} + +TEST_F(ScatterGatherShapeInferenceTest, + InvalidScatterDimNumbers_RepeatedValuesInScatterDimsToOperandDims) { + StatusOr statusor = ShapeInference::InferScatterShape( + f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_, + ShapeUtil::MakeShape(F32, {10, 9, 8, 7, 30, 29, 28}), to_apply_, + HloScatterInstruction::MakeScatterDimNumbers( + /*update_window_dims=*/{4, 5, 6}, + /*inserted_window_dims=*/{1, 2}, + /*scatter_dims_to_operand_dims=*/{0, 1, 2, 2, 3}, + /*index_vector_dim=*/4)); + ASSERT_FALSE(statusor.ok()); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "Repeated dimensions not allowed in scatter_dims_to_operand_dims")) + << statusor.status(); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/stream_pool.cc b/tensorflow/compiler/xla/service/stream_pool.cc index 92bb21b816c36df4dee266942a7ce51718efdfd1..c0582c6a2d3a05e2ed5aead5faac54e536d350cd 100644 --- a/tensorflow/compiler/xla/service/stream_pool.cc +++ b/tensorflow/compiler/xla/service/stream_pool.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/stream_pool.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/core/platform/logging.h" namespace xla { @@ -27,6 +28,8 @@ StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) { // Re-use an existing stream from the pool. stream = std::move(streams_.back()); streams_.pop_back(); + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool reusing existing stream"; } } @@ -34,6 +37,8 @@ StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) { // Create a new stream. stream = MakeUnique(executor); stream->Init(); + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool created new stream"; } // Return the stream wrapped in Ptr, which has our special deleter semantics. @@ -43,12 +48,16 @@ StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) { void StreamPool::ReturnStream(se::Stream* stream) { if (stream->ok()) { + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool returning ok stream"; tensorflow::mutex_lock lock(mu_); streams_.emplace_back(stream); } else { - // If the stream has encountered any errors, all subsequent - // operations on it will fail. So just delete the stream, and rely - // on new streams to be created in the future. + // If the stream has encountered any errors, all subsequent operations on it + // will fail. So just delete the stream, and rely on new streams to be + // created in the future. + VLOG(1) << stream->DebugStreamPointers() + << " StreamPool deleting !ok stream"; delete stream; } } diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 7232c658b3f0687ac93a83e46a200f88bf202084..32d368a90429ec026120bdf033957617eeaba23e 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -43,15 +43,39 @@ TransferManager::GetPlatformTransferManagers() { StatusOr> TransferManager::TransferLiteralFromDevice( se::Stream* stream, const ShapedBuffer& device_buffer) { StatusOr> ret; + se::Stream* substream = stream->GetOrCreateSubStream(); substream->ThenWaitFor(stream); auto cleanup = tensorflow::gtl::MakeCleanup( [&]() { stream->ReturnSubStream(substream); }); tensorflow::Notification n; - TransferLiteralFromDevice(substream, device_buffer, - [&](StatusOr> arg) { - ret = std::move(arg); + Status s; + Literal literal(device_buffer.on_host_shape()); + TransferLiteralFromDevice(substream, device_buffer, literal, + [&](Status status) { + s = status; + n.Notify(); + }); + n.WaitForNotification(); + if (!s.ok()) { + return s; + } + return MakeUnique(std::move(literal)); +} + +Status TransferManager::TransferLiteralFromDevice( + se::Stream* stream, const ShapedBuffer& device_buffer, + const MutableBorrowingLiteral& literal) { + se::Stream* substream = stream->GetOrCreateSubStream(); + auto cleanup = tensorflow::gtl::MakeCleanup( + [&]() { stream->ReturnSubStream(substream); }); + + Status ret; + tensorflow::Notification n; + TransferLiteralFromDevice(substream, device_buffer, literal, + [&](Status status) { + ret = status; n.Notify(); }); n.WaitForNotification(); @@ -76,22 +100,27 @@ Status TransferManager::TransferLiteralToDevice( StatusOr> TransferManager::TransferArrayFromDevice( se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source) { + StatusOr> ret; // Implement the synchronous version by waiting on the asynchronous version. // Use a substream so that if we are called from a HostCallback we don't // deadlock. - StatusOr> ret; se::Stream* substream = stream->GetOrCreateSubStream(); auto cleanup = tensorflow::gtl::MakeCleanup( [&]() { stream->ReturnSubStream(substream); }); tensorflow::Notification n; - TransferArrayFromDevice(substream, shape, source, - [&](StatusOr> arg) { - ret = std::move(arg); + Literal literal(shape); + Status s; + TransferArrayFromDevice(substream, shape, source, literal, + [&](Status status) { + s = status; n.Notify(); }); n.WaitForNotification(); - return ret; + if (!s.ok()) { + return s; + } + return MakeUnique(std::move(literal)); } Status TransferManager::TransferArrayToDevice( @@ -130,7 +159,7 @@ Status TransferManager::TransferArrayToDeviceAsync( void TransferManager::TransferArrayFromDevice( se::Stream* stream, const Shape& shape, const se::DeviceMemoryBase& source, - std::function>)> done) { + const MutableBorrowingLiteral& literal, std::function done) { if (!ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) { auto error = StrCat("Shape ", ShapeUtil::HumanString(shape), " has a differently shaped representation on-device: ", @@ -147,7 +176,8 @@ void TransferManager::TransferArrayFromDevice( stream->parent()->platform(), stream->parent()->device_ordinal()); shaped_buffer.set_buffer(source, /*index=*/{}); - return TransferLiteralFromDevice(stream, shaped_buffer, std::move(done)); + return TransferLiteralFromDevice(stream, shaped_buffer, literal, + std::move(done)); } /* static */ void TransferManager::RegisterTransferManager( diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 82c599e482d85fc5bbe5a5a48c6c6b053186803b..475a2e5c141d66fa689fb402da1ee81fb4ab80f7 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -59,6 +59,9 @@ class TransferManager { // This function should be avoided in favor of the asynchronous version below. virtual StatusOr> TransferLiteralFromDevice( se::Stream* stream, const ShapedBuffer& device_buffer); + virtual Status TransferLiteralFromDevice( + se::Stream* stream, const ShapedBuffer& device_buffer, + const MutableBorrowingLiteral& literal); // Begins transferring a literal containing the data held in the given // ShapedBuffer using the provided executor. @@ -69,9 +72,10 @@ class TransferManager { // // device_buffer is copied by reference and must live at least until done() is // invoked. - virtual void TransferLiteralFromDevice( - se::Stream* stream, const ShapedBuffer& device_buffer, - std::function>)> done) = 0; + virtual void TransferLiteralFromDevice(se::Stream* stream, + const ShapedBuffer& device_buffer, + MutableBorrowingLiteral literal, + std::function done) = 0; // Transfers the given literal into the previously allocated device memory // represented by the given ShapedBuffer using the given executor. The shape @@ -101,10 +105,10 @@ class TransferManager { // transfer an array at a known address. Status TransferArrayToDevice(se::Stream* stream, const LiteralSlice& literal, const se::DeviceMemoryBase& dest); - void TransferArrayFromDevice( - se::Stream* stream, const Shape& shape, - const se::DeviceMemoryBase& source, - std::function>)> done); + void TransferArrayFromDevice(se::Stream* stream, const Shape& shape, + const se::DeviceMemoryBase& source, + const MutableBorrowingLiteral& literal, + std::function done); Status TransferArrayToDeviceAsync(se::Stream* stream, const LiteralSlice& literal, @@ -120,9 +124,9 @@ class TransferManager { // Transfers the given literal from the Outfeed interface of the device, // using the given executor. - virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, - const Shape& literal_shape, - Literal* literal) = 0; + virtual Status TransferLiteralFromOutfeed( + se::StreamExecutor* executor, const Shape& literal_shape, + MutableBorrowingLiteral literal) = 0; // Resets the devices associated with this transfer manager. virtual Status ResetDevices( diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 0effdc80a43ed8c7edc7ef06aeef1c03c1c9224d..0447807a41b8b32ee297e1ca94393da8c687c5e6 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -232,8 +232,7 @@ Status TuplePointsToAnalysis::HandleGetTupleElement( // Copy the points-to set (and tuple sources) at index {element_index} of the // operand to the points-to set for this GetTupleElement instruction. points_to_set.ForEachMutableElement( - [&, this](const ShapeIndex& target_index, - PointsToSet::BufferList* points_to) { + [&](const ShapeIndex& target_index, PointsToSet::BufferList* points_to) { // Construct an index into the operand by prepending element_index to // the index for the GetTupleElement instruction's points-to set. ShapeIndex src_index; @@ -308,7 +307,7 @@ Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) { // Recursively copy the points to set of the operand tuple {0} to the output // element {0}. points_to_set.ForEachMutableElement( - [this, &points_to_set, &operand_points_to_set]( + [&points_to_set, &operand_points_to_set]( const ShapeIndex& index, PointsToSet::BufferList* buffers) { if (index.empty() || index[0] != 0) { return; @@ -517,7 +516,7 @@ Status TuplePointsToAnalysis::GatherBuffersDefinedByInstruction( const HloInstruction* instruction, TuplePointsToAnalysis::BufferDefinitionVector* buffers) { GetPointsToSet(instruction) - .ForEachElement([this, buffers, instruction]( + .ForEachElement([buffers, instruction]( const ShapeIndex& index, const PointsToSet::BufferList& source_buffers) { // Add buffers which 'instruction' is the source of. @@ -547,7 +546,7 @@ PointsToSet& TuplePointsToAnalysis::CreateCopiedPointsToSet( PointsToSet& dst_points_to_set = CreateEmptyPointsToSet(instruction); const PointsToSet& src_points_to_set = GetPointsToSet(src); dst_points_to_set.ForEachMutableElement( - [this, &dst_points_to_set, &src_points_to_set]( + [&dst_points_to_set, &src_points_to_set]( const ShapeIndex& index, PointsToSet::BufferList* buffers) { *buffers = src_points_to_set.element(index); for (auto& tuple_source : src_points_to_set.tuple_sources(index)) { diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index 2e5f6468044036016c5c9e5013968ac5a1046b7d..10d382e8abc92145c1804cbf18bbed714fa34571 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -1118,7 +1118,7 @@ TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - auto make_cond = [this, &data_shape]() { + auto make_cond = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Cond"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); @@ -1127,7 +1127,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { return builder.Build(); }; - auto make_body = [this, &data_shape]() { + auto make_body = [&data_shape]() { auto builder = HloComputation::Builder(TestName() + ".Body"); auto data = builder.AddInstruction( HloInstruction::CreateParameter(0, data_shape, "data")); diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc new file mode 100644 index 0000000000000000000000000000000000000000..af2cb6dc2a3f4a004351acc62796e0daf46719c2 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc @@ -0,0 +1,238 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { + +using tensorflow::gtl::nullopt; +using tensorflow::gtl::optional; + +// Finds and returns the non-constant operand in instr. +// +// CHECK-fails if instr doesn't have exactly one unique non-constant operand. +static const HloInstruction* NonConstantOperand(const HloInstruction* instr) { + const HloInstruction* result = nullptr; + for (const HloInstruction* operand : instr->operands()) { + if (!operand->IsConstant()) { + if (result != nullptr) { + CHECK_EQ(result, operand); + } + result = operand; + } + } + CHECK_NE(result, nullptr); + return result; +} + +// If all of instr's operands are either constants or have the form +// get-tuple-element(gte_operand, N) +// for the same value N, returns N. Otherwise, returns nullopt. +static optional GetGTEOperandIndex(const HloInstruction* instr, + const HloInstruction* gte_operand) { + VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", " + << gte_operand->ToString() << ")"; + optional tuple_idx; + for (const HloInstruction* operand : instr->operands()) { + if (operand->IsConstant()) { + continue; + } + // Look through copies. + // TODO(b/68830972): We wouldn't need this if for loop matching on the GPU + // would run before copy insertion. + if (operand->opcode() == HloOpcode::kCopy) { + operand = operand->operand(0); + } + if (operand->opcode() != HloOpcode::kGetTupleElement) { + VLOG(2) << "instr uses something other than gte(gte_operand): " + << operand->ToString(); + return nullopt; + } + if (operand->operand(0) != gte_operand) { + VLOG(2) << "instr has gte whose operand is not gte_operand: " + << operand->ToString(); + return nullopt; + } + if (tuple_idx && tuple_idx != operand->tuple_index()) { + VLOG(2) << "instr has operands with conflicting gte indices, " + << *tuple_idx << " vs " << operand->tuple_index(); + return nullopt; + } + + tuple_idx = operand->tuple_index(); + } + return tuple_idx; +} + +// Tries to get the tuple index of the induction variable of a while loop. +// +// Checks that the loop condition and root both plumb the induction variable +// through the same tuple index, and that they both apply exactly one op to the +// induction variable before deciding whether to do another loop iteration (in +// the loop condition's case) or packing the induction variable into the result +// tuple (in the loop body's case). +// +// Specifically, checks that the loop condition has structure +// +// root = op(constants, get-tuple-elem(param0, N), constants) +// +// and the loop body has the structure +// +// inc = op(constants, get-tuple-elem(param0, N), constants) +// root = tuple(..., inc, ...) // inc is N'th operand of tuple(). +// +// If so, returns N. Otherwise, returns nullopt. +static optional GetLoopInductionVarTupleIdx( + const HloInstruction* while_op) { + CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); + VLOG(2) << "Finding induction variable for loop " + << while_op->ToShortString(); + + // The while_cond computation should have the form + // + // while_cond_root = + // op(constants, get-tuple-elem(while_cond_param, N), constants). + // + // If it does, set indvar_tuple_idx to N. + auto* while_cond = while_op->while_condition(); + auto* while_cond_root = while_cond->root_instruction(); + auto* while_cond_param = while_cond->parameter_instruction(0); + optional indvar_tuple_idx = + GetGTEOperandIndex(while_cond_root, while_cond_param); + if (!indvar_tuple_idx) { + VLOG(2) << "Induction variable not found in loop condition: " + << while_cond->root_instruction()->ToString(); + return nullopt; + } + + // The while_body computation should have the form + // + // while_body_inc = + // op(constants, get-tuple-elem(while_body_param, N), constants) + // while_body_root = tuple(..., while_body_inc, ...) + // + // where while_body_inc is operand N of while_body_root. + auto* while_body = while_op->while_body(); + auto* while_body_root = while_body->root_instruction(); + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While body's root is not a tuple instruction: " + << while_body_root->ToString(); + return nullopt; + } + + auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx); + auto* while_body_param = while_body->parameter_instruction(0); + optional while_body_indvar_tuple_idx = + GetGTEOperandIndex(while_body_inc, while_body_param); + if (!while_body_indvar_tuple_idx) { + VLOG(2) + << "Induction variable not found in while body increment instruction: " + << while_body_inc->ToString(); + return nullopt; + } + if (while_body_indvar_tuple_idx != indvar_tuple_idx) { + VLOG(2) << "Tuple index of induction variable does not match between loop " + "condition (" + << *indvar_tuple_idx << ") and while body (" + << *while_body_indvar_tuple_idx << ")"; + return nullopt; + } + + // Finally, check that the while loop's initial value is a tuple with enough + // elements. + auto* while_init = while_op->operand(0); + if (while_init->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While init expected to be a tuple: " << while_init->ToString(); + return nullopt; + } + + VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx; + return indvar_tuple_idx; +} + +optional ComputeWhileLoopTripCount(HloInstruction* while_op, + int64 max_value_returned) { + VLOG(2) << "Getting trip count for loop " << while_op->ToString(); + + // The loop's induction variable is found at + // + // get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx), + // + // where comp is while_op->while_body() or while_op->while_condition(). + optional indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op); + if (!indvar_tuple_idx) { + return nullopt; + } + + // Now that we know the index of the induction variable, we can we can try to + // compute how many times the loop executes. Start by computing the induction + // variable's initial value. + HloEvaluator evaluator(/*max_loop_iterations=*/0); + auto* while_init = while_op->mutable_operand(0); + auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx); + StatusOr> indvar_init_result = + evaluator.Evaluate(indvar_init); + if (!indvar_init_result.ok()) { + VLOG(2) << "Couldn't evaluate induction variable init: " + << indvar_init_result.status(); + return nullopt; + } + + auto* while_body = while_op->while_body(); + auto* while_body_indvar_update = + while_body->root_instruction()->operand(*indvar_tuple_idx); + auto* while_body_indvar = NonConstantOperand(while_body_indvar_update); + + // The initial value of the induction variable. + std::unique_ptr indvar_iter_val = + std::move(indvar_init_result).ValueOrDie(); + for (int64 trip_count = 0; trip_count != max_value_returned + 1; + ++trip_count) { + auto* while_cond = while_op->while_condition(); + auto* while_cond_root = while_cond->root_instruction(); + auto* while_cond_indvar = NonConstantOperand(while_cond_root); + StatusOr> result = + evaluator.EvaluateWithSubstitutions( + while_cond_root, {{while_cond_indvar, indvar_iter_val.get()}}); + if (!result.ok()) { + VLOG(2) << "Couldn't evaluate while cond: " << result.status(); + return nullopt; + } + if (result.ValueOrDie()->data() == + tensorflow::gtl::ArraySlice{false}) { + VLOG(2) << "Loop has static trip count of " << trip_count; + return trip_count; + } + + // Calculate the value of the induction variable after one iteration of the + // loop, and check whether the while condition is true with this new value. + StatusOr> indvar_next_result = + evaluator.EvaluateWithSubstitutions( + while_body_indvar_update, + {{while_body_indvar, indvar_iter_val.get()}}); + if (!indvar_next_result.ok()) { + VLOG(2) << "Couldn't evaluate induction variable update: " + << indvar_next_result.status(); + return nullopt; + } + indvar_iter_val = std::move(indvar_next_result).ValueOrDie(); + } + + VLOG(2) << "Loop has unknown trip count."; + return nullopt; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h new file mode 100644 index 0000000000000000000000000000000000000000..bf59813e8c405a8709446bf8457729348ceae4ec --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_analysis.h @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/core/lib/gtl/optional.h" + +namespace xla { + +// Returns the precise trip count of the loop if it's statically known, +// nullopt otherwise. max_value_returned limits the number of steps that are +// evaluated while trying to brute force a loop trip count, trip counts larger +// than max_value_returned result in nullopt. +tensorflow::gtl::optional ComputeWhileLoopTripCount( + HloInstruction *while_op, int64 max_value_returned = 128); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_ diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index ec05a74e286c89dd8db5ae07580e461938d7c087..dd8697e680c56165f87c365a721eda2de1ebc085 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" #include "tensorflow/compiler/xla/service/call_inliner.h" -#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -26,23 +26,6 @@ namespace xla { using tensorflow::gtl::nullopt; using tensorflow::gtl::optional; -// Finds and returns the non-constant operand in instr. -// -// CHECK-fails if instr doesn't have exactly one unique non-constant operand. -static const HloInstruction* NonConstantOperand(const HloInstruction* instr) { - const HloInstruction* result = nullptr; - for (const HloInstruction* operand : instr->operands()) { - if (!operand->IsConstant()) { - if (result != nullptr) { - CHECK_EQ(result, operand); - } - result = operand; - } - } - CHECK_NE(result, nullptr); - return result; -} - // Determines whether the given instruction is a send/recv node, or has a // subcomputation which contains a send/recv node. static bool IsOrContainsSendOrRecv(const HloInstruction* instr); @@ -72,211 +55,6 @@ static bool IsOrContainsSendOrRecv(const HloInstruction* instr) { return false; } -// If all of instr's operands are either constants or have the form -// get-tuple-element(gte_operand, N) -// for the same value N, returns N. Otherwise, returns nullopt. -static optional GetGTEOperandIndex(const HloInstruction* instr, - const HloInstruction* gte_operand) { - VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", " - << gte_operand->ToString() << ")"; - optional tuple_idx; - for (const HloInstruction* operand : instr->operands()) { - if (operand->IsConstant()) { - continue; - } - if (operand->opcode() != HloOpcode::kGetTupleElement) { - VLOG(2) << "instr uses something other than gte(gte_operand): " - << operand->ToString(); - return nullopt; - } - if (operand->operand(0) != gte_operand) { - VLOG(2) << "instr has gte whose operand is not gte_operand: " - << operand->ToString(); - return nullopt; - } - if (tuple_idx && tuple_idx != operand->tuple_index()) { - VLOG(2) << "instr has operands with conflicting gte indices, " - << *tuple_idx << " vs " << operand->tuple_index(); - return nullopt; - } - - tuple_idx = operand->tuple_index(); - } - return tuple_idx; -} - -// Tries to get the tuple index of the induction variable of a while loop. -// -// Checks that the loop condition and root both plumb the induction variable -// through the same tuple index, and that they both apply exactly one op to the -// induction variable before deciding whether to do another loop iteration (in -// the loop condition's case) or packing the induction variable into the result -// tuple (in the loop body's case). -// -// Specifically, checks that the loop condition has structure -// -// root = op(constants, get-tuple-elem(param0, N), constants) -// -// and the loop body has the structure -// -// inc = op(constants, get-tuple-elem(param0, N), constants) -// root = tuple(..., inc, ...) // inc is N'th operand of tuple(). -// -// If so, returns N. Otherwise, returns nullopt. -static optional GetLoopInductionVarTupleIdx( - const HloInstruction* while_op) { - CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); - VLOG(2) << "Finding induction variable for loop " - << while_op->ToShortString(); - - // The while_cond computation should have the form - // - // while_cond_root = - // op(constants, get-tuple-elem(while_cond_param, N), constants). - // - // If it does, set indvar_tuple_idx to N. - auto* while_cond = while_op->while_condition(); - auto* while_cond_root = while_cond->root_instruction(); - auto* while_cond_param = while_cond->parameter_instruction(0); - optional indvar_tuple_idx = - GetGTEOperandIndex(while_cond_root, while_cond_param); - if (!indvar_tuple_idx) { - VLOG(2) << "Induction variable not found in loop condition: " - << while_cond->root_instruction()->ToString(); - return nullopt; - } - - // The while_body computation should have the form - // - // while_body_inc = - // op(constants, get-tuple-elem(while_body_param, N), constants) - // while_body_root = tuple(..., while_body_inc, ...) - // - // where while_body_inc is operand N of while_body_root. - auto* while_body = while_op->while_body(); - auto* while_body_root = while_body->root_instruction(); - if (while_body_root->opcode() != HloOpcode::kTuple) { - VLOG(2) << "While body's root is not a tuple instruction: " - << while_body_root->ToString(); - return nullopt; - } - - auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx); - auto* while_body_param = while_body->parameter_instruction(0); - optional while_body_indvar_tuple_idx = - GetGTEOperandIndex(while_body_inc, while_body_param); - if (!while_body_indvar_tuple_idx) { - VLOG(2) - << "Induction variable not found in while body increment instruction: " - << while_body_inc->ToString(); - return nullopt; - } - if (while_body_indvar_tuple_idx != indvar_tuple_idx) { - VLOG(2) << "Tuple index of induction variable does not match between loop " - "condition (" - << *indvar_tuple_idx << ") and while body (" - << *while_body_indvar_tuple_idx << ")"; - return nullopt; - } - - // Finally, check that the while loop's initial value is a tuple with enough - // elements. - auto* while_init = while_op->operand(0); - if (while_init->opcode() != HloOpcode::kTuple) { - VLOG(2) << "While init expected to be a tuple: " << while_init->ToString(); - return nullopt; - } - - VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx; - return indvar_tuple_idx; -} - -// Tries to determine the number of times the given loop executes. Currently -// simply returns 0, 1, or "can't tell" (nullopt). -static optional GetLoopTripCount(HloInstruction* while_op) { - CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); - VLOG(2) << "Getting trip count for loop " << while_op->ToString(); - - // The loop's induction variable is found at - // - // get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx), - // - // where comp is while_op->while_body() or while_op->while_condition(). - optional indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op); - if (!indvar_tuple_idx) { - return nullopt; - } - - VLOG(2) << "Induction variable is at index " << *indvar_tuple_idx - << " in input tuple."; - - // Now that we know the index of the induction variable, we can we can try to - // compute how many times the loop executes. Start by computing the induction - // variable's initial value. - HloEvaluator evaluator(/*max_loop_iterations=*/0); - auto* while_init = while_op->mutable_operand(0); - auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx); - StatusOr> indvar_init_result = - evaluator.Evaluate(indvar_init); - if (!indvar_init_result.ok()) { - VLOG(2) << "Couldn't evaluate induction variable init: " - << indvar_init_result.status(); - return nullopt; - } - - // Evaluates the while loop's condition, returning either "true" (continue - // looping), "false" (stop looping), or nullopt (can't evaluate). - auto evaluate_while_cond = [&](const Literal& indvar) -> optional { - auto* while_cond = while_op->while_condition(); - auto* while_cond_root = while_cond->root_instruction(); - auto* while_cond_indvar = NonConstantOperand(while_cond_root); - StatusOr> result = - evaluator.EvaluateWithSubstitutions(while_cond_root, - {{while_cond_indvar, &indvar}}); - if (!result.ok()) { - VLOG(2) << "Couldn't evaluate while cond: " << result.status(); - return nullopt; - } - return result.ValueOrDie()->data() == - tensorflow::gtl::ArraySlice{true}; - }; - - // The initial value of the induction variable. - const Literal& indvar_iter0_val = *indvar_init_result.ValueOrDie(); - - // Evaluate whether the while condition is true when seeded with - // indvar_iter0_val. - optional while_cond_iter0_val = evaluate_while_cond(indvar_iter0_val); - if (while_cond_iter0_val == false) { - VLOG(2) << "Loop has static trip count of 0."; - return 0; - } - - // Calculate the value of the induction variable after one iteration of the - // loop, and check whether the while condition is true with this new value. - auto* while_body = while_op->while_body(); - auto* while_body_indvar_update = - while_body->root_instruction()->operand(*indvar_tuple_idx); - auto* while_body_indvar = NonConstantOperand(while_body_indvar_update); - StatusOr> indvar_iter1_result = - evaluator.EvaluateWithSubstitutions( - while_body_indvar_update, {{while_body_indvar, &indvar_iter0_val}}); - if (!indvar_iter1_result.ok()) { - VLOG(2) << "Couldn't evaluate induction variable update: " - << indvar_iter1_result.status(); - return nullopt; - } - const Literal& indvar_iter1_val = *indvar_iter1_result.ValueOrDie(); - optional while_cond_iter1_val = evaluate_while_cond(indvar_iter1_val); - if (while_cond_iter1_val == false) { - VLOG(2) << "Determined that loop has static trip count of 1."; - return 1; - } - - VLOG(2) << "Loop has unknown trip count >= 1."; - return nullopt; -} - // Tries to remove elements in a while loop's tuple that aren't used within the // loop. // @@ -577,7 +355,9 @@ static StatusOr TryRemoveWhileLoop(HloInstruction* while_op) { } // Remove while loops with static trip count of 0. - optional trip_count = GetLoopTripCount(while_op); + optional trip_count = + ComputeWhileLoopTripCount(while_op, + /*max_value_returned=*/1); if (trip_count && *trip_count == 0) { // The loop never executes, so the value of the loop is the value of its // "init" operand. diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index 4391078b6484f25ba81aefa2c1d1f69d7d2774f4..c4c958be4a18f23b8e34f9e619e447c6bf4334b5 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -172,7 +172,7 @@ TEST_F(ShapeTreeTest, TupleShape) { // Write zero to all data elements. shape_tree.ForEachMutableElement( - [&sum](const ShapeIndex& /*index*/, int* data) { *data = 0; }); + [](const ShapeIndex& /*index*/, int* data) { *data = 0; }); EXPECT_EQ(0, shape_tree.element({})); EXPECT_EQ(0, shape_tree.element({0})); EXPECT_EQ(0, shape_tree.element({1})); diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index ec901af1e2057449452c4c65243593b016a26f61..34869cc5078699603c006387161fddd4fee4a9f8 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -596,8 +596,7 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { }; auto comma_list_to_int64s = - [&s, - string_to_int64](const string& input) -> StatusOr> { + [string_to_int64](const string& input) -> StatusOr> { std::vector results; for (const string& piece : tensorflow::str_util::Split(input, ',')) { TF_ASSIGN_OR_RETURN(int64 element, string_to_int64(piece)); @@ -792,7 +791,7 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { if (LayoutUtil::IsSparseArray(shape)) { allocated_element_count = LayoutUtil::MaxSparseElements(shape.layout()); } else { - CHECK(LayoutUtil::IsDenseArray(shape)); + CHECK(LayoutUtil::IsDenseArray(shape)) << shape.ShortDebugString(); tensorflow::gtl::ArraySlice padded_dimensions = LayoutUtil::PaddedDimensions(shape); if (!padded_dimensions.empty()) { diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc index d372d1ca434b1da416f671060f9461cf07aa5fc4..24b17b71007a1872462bed1f6b86ae1a5bb9922c 100644 --- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc +++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc @@ -733,7 +733,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) { var4D, [epsilon](float a) { return a + epsilon; }); auto rsqrt_var_add_epsilon = *ReferenceUtil::MapArray4D( - var_add_epsilon, [epsilon](float a) { return 1 / std::sqrt(a); }); + var_add_epsilon, [](float a) { return 1 / std::sqrt(a); }); auto grad_output_times_var = *ReferenceUtil::MapArray4D(grad_output_array, var_add_epsilon, diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 4a6e8a31241d39db21935576d57f0acb17caef11..b04a3b105ca017b6a91d271e603dcd0cc2068a33 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -74,8 +74,9 @@ class ClientLibraryTestBase : public ::testing::Test { string TestName() const; void SetFastMathDisabled(bool disabled) { - execution_options_.mutable_debug_options()->set_xla_enable_fast_math( - !disabled); + auto* opts = execution_options_.mutable_debug_options(); + opts->set_xla_cpu_enable_fast_math(!disabled); + opts->set_xla_gpu_enable_fast_math(!disabled); } void SetSeed(uint64 seed) { execution_options_.set_seed(seed); } diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index cfd36abf47c0e510b41b4ce8dfba077f4119a6c2..0e9e92ed996fbb34826d19b670c7c4920a1aad13 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -111,7 +111,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TrivialMatrixVectorDot) { this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, OneElementVectorDot) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, OneElementVectorDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto lhs = ConstantR1(&builder, {static_cast(2.0f)}); @@ -137,7 +137,7 @@ std::vector MinorToMajorForIsRowMajor(bool row_major) { return {row_major ? 1 : 0, row_major ? 0 : 1}; } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x0) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_0x2_2x0) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto lhs = ConstantR2FromArray2D(&builder, Array2D(0, 2)); @@ -148,7 +148,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x0) { this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x3) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_0x2_2x3) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto lhs = ConstantR2FromArray2D(&builder, Array2D(0, 2)); @@ -160,7 +160,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x3) { this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_3x2_2x0) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_3x2_2x0) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto lhs = ConstantR2FromArray2D( @@ -172,7 +172,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_3x2_2x0) { this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_2x0_0x2) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, Dot_2x0_0x2) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto lhs = ConstantR2FromArray2D(&builder, Array2D(2, 0)); @@ -183,7 +183,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_2x0_0x2) { &builder, Array2D(2, 2, static_cast(0.0f)), {}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, FusedDot) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, FusedDot) { using T = TypeParam; XlaBuilder builder(this->TestName()); auto param0 = @@ -533,7 +533,7 @@ XLA_TEST_F(DotOperationTest, MatrixVectorC64) { &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, ConcurrentMatMult) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, ConcurrentMatMult) { using T = TypeParam; XlaBuilder builder(this->TestName()); @@ -612,7 +612,7 @@ XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { {x_data.get(), y_data.get()}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMul) { using T = TypeParam; XlaBuilder builder(this->TestName()); @@ -648,7 +648,49 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) { {x_data.get(), y_data.get()}, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) { +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulMultipleBatch) { + using T = TypeParam; + + XlaBuilder builder(this->TestName()); + auto x = Parameter(&builder, 0, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), + "x"); + auto y = Parameter(&builder, 1, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), + "y"); + + DotDimensionNumbers dnums; + dnums.add_lhs_contracting_dimensions(3); + dnums.add_rhs_contracting_dimensions(2); + dnums.add_lhs_batch_dimensions(0); + dnums.add_lhs_batch_dimensions(1); + dnums.add_rhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(1); + + DotGeneral(x, y, dnums); + + auto x_data = + this->client_ + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D( + {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, + {{{9.0f, 10.0f}, {11.0f, 12.0f}}, + {{13.0f, 14.0f}, {15.0f, 16.0f}}}})) + .ConsumeValueOrDie(); + + auto y_data = + this->client_ + ->TransferToServer(*LiteralUtil::CreateR4FromArray4D( + {{{{1.0f, 0.0f}, {0.0f, 1.0f}}, {{1.0f, 0.0f}, {0.0f, 1.0f}}}, + {{{0.0f, 1.0f}, {1.0f, 0.0f}}, {{0.0f, 1.0f}, {1.0f, 0.0f}}}})) + .ConsumeValueOrDie(); + + this->template ComputeAndCompareR4( + &builder, + /*expected=*/ + {{{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}, + {{{10.0f, 9.0f}, {12.0f, 11.0f}}, {{14.0f, 13.0f}, {16.0f, 15.0f}}}}, + {x_data.get(), y_data.get()}, this->error_spec_); +} + +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, TransposeFolding) { using T = TypeParam; for (bool transpose_lhs : {false, true}) { for (bool transpose_rhs : {false, true}) { @@ -708,7 +750,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) { } } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, DotOfConcatOptimizationWithConstLHS) { using T = TypeParam; auto prim_type = primitive_util::NativeToPrimitiveType(); @@ -754,7 +796,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, this->error_spec_); } -XLA_TYPED_TEST(DotOperationTest_F16F32F64, +XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, DotOfConcatOptimizationWithConstRHS) { using T = TypeParam; std::unique_ptr> constant_rhs_array( diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index b662e837168c8b16daea0181786be19fa0237a8c..f05d1a8b9d372e720ae1634a9c8d5c0591e39b89 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -83,13 +83,16 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) { } // namespace -HloTestBase::HloTestBase() - : HloTestBase(GetTestPlatform(), GetReferencePlatform()) {} +HloTestBase::HloTestBase(bool allow_mixed_precision_in_hlo_verifier) + : HloTestBase(GetTestPlatform(), GetReferencePlatform(), + allow_mixed_precision_in_hlo_verifier) {} HloTestBase::HloTestBase(se::Platform* test_platform, - se::Platform* reference_platform) + se::Platform* reference_platform, + bool allow_mixed_precision_in_hlo_verifier) : test_runner_(test_platform), reference_runner_(reference_platform) { - hlo_verifier_ = MakeUnique(/*allow_mixed_precision=*/true); + hlo_verifier_ = + MakeUnique(allow_mixed_precision_in_hlo_verifier); } /* static */ @@ -233,6 +236,29 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( reference_preprocessor); } +::testing::AssertionResult HloTestBase::Run(const StringPiece hlo_string) { + auto module_or_status = + HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()); + if (!module_or_status.ok()) { + return ::testing::AssertionFailure() + << "Error while parsing HLO text format: " + << module_or_status.status().ToString(); + } + const auto& fake_arguments = + MakeFakeArguments(module_or_status.ValueOrDie().get()) + .ConsumeValueOrDie(); + std::vector fake_argument_ptrs; + c_transform( + fake_arguments, std::back_inserter(fake_argument_ptrs), + [](const std::unique_ptr& literal) { return literal.get(); }); + return test_runner_ + .Execute(std::move(module_or_status.ValueOrDie()), + fake_argument_ptrs, /*run_hlo_passes=*/true) + .ok() + ? ::testing::AssertionSuccess() + : ::testing::AssertionFailure(); +} + ::testing::AssertionResult HloTestBase::RunAndCompareFromFile( const string& filename, const tensorflow::gtl::optional& error, const std::function& reference_preprocessor) { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 66719b1460063a61541535ff7507468ae0ca1ada..4232eeceb10b37a209f247ffa70fb9a08be337e6 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -80,12 +80,13 @@ class HloTestBase : public ::testing::Test { // automatically finds another supported backend as the test backend. If the // interpreter is the only supported backend, it will be both the test backend // and the reference backend. - HloTestBase(); + HloTestBase(bool allow_mixed_precision_in_hlo_verifier = true); // If your test doesn't use interpreter as the reference backend, you can use // this constructor. Note that your test target is responsible for linking in // both needed backends. - HloTestBase(se::Platform* test_platform, se::Platform* reference_platform); + HloTestBase(se::Platform* test_platform, se::Platform* reference_platform, + bool allow_mixed_precision_in_hlo_verifier = true); ~HloTestBase() override {} @@ -166,6 +167,8 @@ class HloTestBase : public ::testing::Test { const tensorflow::gtl::optional& error, const std::function& reference_preprocessor = nullptr) TF_MUST_USE_RESULT; + ::testing::AssertionResult Run(const tensorflow::StringPiece hlo_string) + TF_MUST_USE_RESULT; ::testing::AssertionResult RunAndCompareFromFile( const string& filename, const tensorflow::gtl::optional& error, const std::function& reference_preprocessor = nullptr) diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc index f950aa1e8fe745075234a5ebff52d92be7378a5d..17ac95ae0198d98490b25f7f2edd32d1e0495803 100644 --- a/tensorflow/compiler/xla/tests/iota_test.cc +++ b/tensorflow/compiler/xla/tests/iota_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { @@ -34,7 +35,7 @@ class IotaTest : public ClientLibraryTestBase { } }; -TEST_F(IotaTest, SimpleR1) { +XLA_TEST_F(IotaTest, SimpleR1) { for (int num_elements = 1; num_elements < 10000001; num_elements *= 10) { { XlaBuilder builder(TestName() + "_f32"); diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test.cc b/tensorflow/compiler/xla/tests/local_client_aot_test.cc index 47cab796041e9669affaebd7866d0d80100730f1..115448c908ac9e7f0b01772ce348d23bf4d838ed 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test.cc @@ -42,13 +42,12 @@ extern "C" void SumStructElements(float* out, void** parameters) { TEST_F(LocalClientAotTest, Constant) { xla::ExecutableRunOptions run_options; OpaqueData opaque_data{100, 20, 3}; - void* parameters[] = {&opaque_data}; float out = 0; - void* temporary_buffers[] = {nullptr, &out}; - SumAndDouble(&out, &run_options, parameters, temporary_buffers); + void* temporary_buffers[] = {&opaque_data, &out}; + SumAndDouble(&out, &run_options, nullptr, temporary_buffers); EXPECT_EQ(out, 246.0f); opaque_data = {1, 2, 3}; - SumAndDouble(&out, &run_options, parameters, temporary_buffers); + SumAndDouble(&out, &run_options, nullptr, temporary_buffers); EXPECT_EQ(out, 12.0f); } diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 74494e60e883417d5772ce71544715aef5ef3ef2..60eb21aafd23a8d724d1f08d5c87098b7c3dcd6b 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -92,10 +92,10 @@ int main(int argc, char** argv) { // It's lame to hard-code the buffer assignments, but we need // local_client_aot_test.cc to be able to easily invoke the function. CHECK_EQ(result->result_buffer_index(), 1); - CHECK_EQ(result->buffer_sizes().size(), 3); - CHECK_EQ(result->buffer_sizes()[0], -1); // param buffer - CHECK_EQ(result->buffer_sizes()[1], sizeof(float)); // result buffer - CHECK_EQ(result->buffer_sizes()[2], -1); // const buffer + CHECK_EQ(result->buffer_infos().size(), 3); + CHECK(result->buffer_infos()[0].is_entry_parameter()); // param buffer + CHECK_EQ(result->buffer_infos()[1].size(), sizeof(float)); // result buffer + CHECK(result->buffer_infos()[2].is_constant()); // const buffer if (triple.isOSBinFormatELF()) { // Check the ELF magic. CHECK_EQ(result->object_file_data()[0], 0x7F); diff --git a/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc index cea7006526f0c56ade3cedead489ea12c0ab3922..0a0426adcbc1b5b89be0841fa2c4204e2b65abf4 100644 --- a/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc +++ b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/tests/local_client_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/lib/core/status_test_util.h" namespace xla { @@ -22,9 +23,9 @@ namespace { // Tests that ensure outfeed instructions that are contained in nested // computations in non-root positions are executed. -class LocalClientExecuteTest : public LocalClientTestBase {}; +class OutfeedInNestedComputationTest : public LocalClientTestBase {}; -TEST_F(LocalClientExecuteTest, OutfeedInWhile) { +XLA_TEST_F(OutfeedInNestedComputationTest, OutfeedInWhile) { XlaBuilder b(TestName()); Shape state_tuple_array_shape = ShapeUtil::MakeShape(xla::S32, {10, 5}); @@ -117,7 +118,7 @@ TEST_F(LocalClientExecuteTest, OutfeedInWhile) { EXPECT_EQ(comp_result->Get({}), 0); } -TEST_F(LocalClientExecuteTest, OutfeedInConditional) { +XLA_TEST_F(OutfeedInNestedComputationTest, OutfeedInConditional) { XlaBuilder b(TestName()); Shape condition_shape = ShapeUtil::MakeShape(xla::PRED, {}); diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 029af69573e458a45cf1e446e942c7401cd9e629..326e13b3867f2f804e882e00e35850d0189ad8d7 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -182,7 +182,7 @@ XLA_TEST_F(PrngTest, Uniformity256) { XLA_TEST_F(PrngTest, MapUsingRng) { // Build a x -> (x + U[0,1)) computation. - auto build_sum_rng = [this](XlaBuilder& builder) { + auto build_sum_rng = [](XlaBuilder& builder) { auto b = builder.CreateSubBuilder("sum_with_rng"); auto x = Parameter(b.get(), 0, ShapeUtil::MakeShape(F32, {}), "input"); Add(x, diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 1bd6fdab31d6c3516339bdb98459ffe3bbdef1d1..92c93f08b2e8e543aeaa58020eddacd109b2e2da 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -1341,7 +1341,7 @@ INSTANTIATE_TEST_CASE_P( // results on the interpreter backend. class ReduceWindowTextTest : public HloTestBase {}; -TEST_F(ReduceWindowTextTest, R2General256x384) { +XLA_TEST_F(ReduceWindowTextTest, R2General256x384) { const string hlo_string = R"( HloModule R2Window mul { @@ -1358,7 +1358,7 @@ ENTRY R2Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R2General256x384Layout01) { +XLA_TEST_F(ReduceWindowTextTest, R2General256x384Layout01) { const string hlo_string = R"( HloModule R2Window mul { @@ -1375,7 +1375,7 @@ ROOT reduce-window = f32[256,384]{0,1} reduce-window(operand, constant), window= EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R2General2x5) { +XLA_TEST_F(ReduceWindowTextTest, R2General2x5) { const string hlo_string = R"( HloModule R2Window mul { @@ -1392,7 +1392,7 @@ ENTRY R2Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R2EffectiveScalar) { +XLA_TEST_F(ReduceWindowTextTest, R2EffectiveScalar) { const string hlo_string = R"( HloModule R2Window mul { @@ -1410,7 +1410,7 @@ ENTRY R2Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(ReduceWindowTextTest, R3EffectiveScalar) { +XLA_TEST_F(ReduceWindowTextTest, R3EffectiveScalar) { const string hlo_string = R"( HloModule R3Window mul { @@ -1428,7 +1428,7 @@ ENTRY R3Window { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0.001})); } -TEST_F(HloTestBase, ReduceWindowIdentity) { +XLA_TEST_F(HloTestBase, ReduceWindowIdentity) { const string hlo_string = R"( HloModule ReduceWindowIdentity identity.pad_to_reduce_window { @@ -1445,7 +1445,7 @@ ENTRY reduce-window-identity { EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt)); } -TEST_F(HloTestBase, ReduceWindowS32) { +XLA_TEST_F(HloTestBase, ReduceWindowS32) { const string hlo_string = R"( HloModule reduce-window @@ -1464,5 +1464,24 @@ ENTRY %reduce-window (parameter.0: s32[81,8], parameter.1: s32[]) -> s32[82,8] { EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt)); } +XLA_TEST_F(HloTestBase, ReduceWindowF16) { + const string hlo_string = R"( +HloModule reduce-window + +%identity.pad_to_reduce_window (param0: f16[], param1: f16[]) -> f16[] { + %param0 = f16[] parameter(0) + ROOT %param1 = f16[] parameter(1) +} + +ENTRY %reduce-window (parameter.0: f16[81,8], parameter.1: f16[]) -> f16[82,8] { + %parameter.0 = f16[81,8]{1,0} parameter(0) + %parameter.1 = f16[] parameter(1) + ROOT %reduce-window = f16[82,8]{1,0} reduce-window(f16[81,8]{1,0} %parameter.0, f16[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window +} + +)"; + EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 2fd70b72b52f360fc74a73cd13d401b7dac6e708..97bbf80aff80e995ea5cdd3e5d8807ee4d380067 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -586,9 +586,9 @@ XLA_TEST_F(TupleHloTest, })); auto expected = LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1({2, 3})); - auto literal = MakeUnique(); + auto literal = Literal::CreateFromShape(expected->shape()); TF_EXPECT_OK(backend().transfer_manager()->TransferLiteralFromOutfeed( - backend().default_stream_executor(), expected->shape(), literal.get())); + backend().default_stream_executor(), expected->shape(), *literal)); EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *literal)); } diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index c81c27891c29394fe01116ca22fa678b0a409c62..1bdf1867b9330b715b0ba4aca71d56307883c775 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1236,6 +1236,35 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) { {param_value.get()}, ErrorSpec(4e-5)); } +TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) { + auto while_shape = ShapeUtil::MakeShape(S32, {}); + + XlaComputation condition; + { + XlaBuilder builder("condition"); + Parameter(&builder, 0, while_shape, "state"); + Infeed(&builder, ShapeUtil::MakeShape(PRED, {})); + TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); + } + + XlaComputation body; + { + XlaBuilder builder("body"); + auto indvar = Parameter(&builder, 0, while_shape, "state"); + Add(indvar, ConstantR0(&builder, 1)); + TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); + } + + XlaBuilder builder(TestName()); + While(condition, body, ConstantR0(&builder, 0)); + + TF_ASSERT_OK(client_->TransferToInfeed(*LiteralUtil::CreateR0(true))); + TF_ASSERT_OK(client_->TransferToInfeed(*LiteralUtil::CreateR0(true))); + TF_ASSERT_OK(client_->TransferToInfeed(*LiteralUtil::CreateR0(false))); + + ComputeAndCompareR0(&builder, 2, {}); +} + void BM_WhileLoop(int num_iters) { // Benchmark a simple kernel to measure while loop overheads. tensorflow::testing::StopTiming(); diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 0ee8e68c88011d53ab6484e0bd81eb969304b6fb..11f3efb1f34ad23ebdcbb65c90aa5fb7a6adeae5 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -84,8 +84,8 @@ Status ParseOneProfileOutputLine( tensorflow::gtl::ArraySlice opcodes_to_ignore = {}) { string separator = "[^:]*:: +"; - string match_percentage = "\\d+\\.\\d\\d%"; - string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)"; + string match_percentage = R"(\d+\.\d*% +\d+Σ)"; + string match_cycles = R"((\d+) cycles +\( *()" + match_percentage + R"()\))"; string match_usecs = "([0-9.]+) usec"; string match_flops = "([^ ]*)"; string match_trops = "([^ ]*)"; @@ -225,7 +225,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { MaybeFind(parsed_profile_lines, "tanh")); EXPECT_GT(total_profile.cycles, 0); - EXPECT_EQ(total_profile.cycles_percentage, "100.00%"); + EXPECT_EQ(total_profile.cycles_percentage, "100.% 100Σ"); EXPECT_TRUE(HasFlops(total_profile)); EXPECT_TRUE(HasTrops(total_profile)); @@ -333,7 +333,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { EXPECT_GT(total_while_body_profile.cycles, 0); EXPECT_EQ(total_while_body_profile.opcode, "[total]"); - EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%"); + EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.% 100Σ"); EXPECT_GT(total_while_body_profile.cycles, multiply_profile.cycles); EXPECT_NE(multiply_profile.cycles_percentage, "0.00%"); diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index d7cabbe876c662fc71237a0fb62141c93e69d14b..40d28a57bfddd3403cad8252df985b746362631f 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -87,6 +87,7 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:testing", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service/gpu:infeed_manager", diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 3bb2f3c0007bbe92aed6a995790284c89719be91..b4774233e588dc407bfb88defca9bf55e08eea09 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -30,6 +30,9 @@ limitations under the License. // The output format is: // // file_path: computation_name :: type:literal_str +// +// Note: If you pass multiple modules, they will be compiled in parallel but run +// in series. #include #include @@ -44,6 +47,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" @@ -75,6 +79,18 @@ struct Options { int num_runs = 1; }; +std::unique_ptr CompileExecutable(const HloSnapshot& module, + LocalClient* client) { + XlaComputation computation(module.hlo().hlo_module()); + std::vector argument_layouts; + for (const auto& param : computation.proto().program_shape().parameters()) { + argument_layouts.push_back(¶m); + } + return client + ->Compile(computation, argument_layouts, ExecutableBuildOptions()) + .ValueOrDie(); +} + // Invokes the given computation passing arbitrary data for every (unbound) // parameter if use_fake_data, Otherwise use recorded data if available. // @@ -85,6 +101,7 @@ struct Options { // If neither generate_fake_infeed is true nor a fake_infeed_shape is provided, // no infeed is performed. StatusOr ReplayComputation(const HloSnapshot& module, + LocalExecutable* executable, LocalClient* client, const Options& opts) { XlaComputation computation(module.hlo().hlo_module()); @@ -167,34 +184,34 @@ StatusOr ReplayComputation(const HloSnapshot& module, }); } - std::vector argument_layouts; - for (const auto& param : computation.proto().program_shape().parameters()) { - argument_layouts.push_back(¶m); - } - std::unique_ptr executable = - client->Compile(computation, argument_layouts, ExecutableBuildOptions()) - .ValueOrDie(); - - // Do not attmept to run the executable, if num_runs is less than 1. + // Do not attempt to run the executable if num_runs is less than 1. if (opts.num_runs < 1) { return Cancelled("Cancelled after compilation since --num_runs < 1."); } // Run the computation num_runs times, and return the result from the last // execution. + const bool xla_hlo_profile = + legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile(); StreamExecutorMemoryAllocator allocator( client->platform(), {client->platform()->ExecutorForDevice(0).ValueOrDie()}); tensorflow::gtl::optional result; for (int i = 0; i < opts.num_runs; ++i) { + // If xla_hlo_profile is enabled, print a noisy message before the last run, + // making it easier to separate this profile from the others in the logspam. + if (xla_hlo_profile && i == opts.num_runs - 1) { + LOG(INFO) << "\n\n***** Final run below ******"; + } ExecutionProfile profile; ExecutableRunOptions run_options; run_options.set_execution_profile(&profile); run_options.set_allocator(&allocator); TF_ASSIGN_OR_RETURN(result, executable->Run(argument_ptrs, run_options)); - LOG(INFO) << "Execution took " - << static_cast(profile.compute_time_ns()) / 1e9 << "s"; + LOG(INFO) << "Done executing in " + << static_cast(profile.compute_time_ns()) / 1e9 + << "s: " << module.hlo().hlo_module().name(); } TF_ASSIGN_OR_RETURN(std::unique_ptr result_literal, @@ -206,9 +223,13 @@ StatusOr ParseInputFile(const string& filename, const Options& opts) { tensorflow::Env* env = tensorflow::Env::Default(); HloSnapshot snapshot; - if (tensorflow::ReadBinaryProto(env, filename, &snapshot).ok()) { + auto s = tensorflow::ReadBinaryProto(env, filename, &snapshot); + if (s.ok()) { return snapshot; } + if (s.code() == tensorflow::error::NOT_FOUND) { + return s; + } CHECK(opts.use_fake_data) << "Without --use_fake_data, you must pass an HloSnapshot -- HloProto " "and textual HLO don't carry real data."; @@ -235,15 +256,42 @@ StatusOr ParseInputFile(const string& filename, int RealMain(tensorflow::gtl::ArraySlice args, const Options& opts) { LocalClient* client = ClientLibrary::LocalClientOrDie(); int exit_status = EXIT_SUCCESS; + + std::vector snapshots; for (char* arg : args) { StatusOr maybe_snapshot = ParseInputFile(arg, opts); - if (!maybe_snapshot.ok()) { - continue; + if (maybe_snapshot.ok()) { + snapshots.push_back(std::move(maybe_snapshot).ValueOrDie()); + } else { + LOG(ERROR) << "Can't handle file " << arg << ": " + << maybe_snapshot.status(); } - HloSnapshot snapshot = std::move(maybe_snapshot).ValueOrDie(); - StatusOr result_status = ReplayComputation(snapshot, client, opts); + } + + // Compile all the modules in parallel. + LOG(INFO) << "Compiling " << snapshots.size() << " modules in parallel."; + std::vector> executables; + { + // ThreadPool CHECK-fails if we give it 0 threads. + tensorflow::thread::ThreadPool thread_pool( + tensorflow::Env::Default(), tensorflow::ThreadOptions(), + "compile_modules", std::max(size_t{1}, snapshots.size()), + /*low_latency_hint=*/false); + executables.resize(snapshots.size()); + for (int64 i = 0; i < snapshots.size(); ++i) { + thread_pool.Schedule([&snapshots, &executables, client, i] { + executables[i] = CompileExecutable(snapshots[i], client); + }); + } + } + LOG(INFO) << "Done compiling; now running the modules."; + + for (int64 i = 0; i < executables.size(); ++i) { + LocalExecutable* executable = executables[i].get(); + StatusOr result_status = + ReplayComputation(snapshots[i], executable, client, opts); if (!result_status.ok()) { - fprintf(stderr, "%s: error: %s\n", arg, + fprintf(stderr, "%s: error: %s\n", args[i], result_status.status().ToString().c_str()); exit_status = EXIT_FAILURE; continue; @@ -251,10 +299,11 @@ int RealMain(tensorflow::gtl::ArraySlice args, const Options& opts) { if (opts.print_result) { Literal result = std::move(result_status).ValueOrDie(); - fprintf(stdout, "%s: %s :: %s:%s\n", arg, - snapshot.hlo().hlo_module().name().c_str(), + fprintf(stdout, "%s: %s :: %s:%s\n", args[i], + executable->executable()->module().name().c_str(), ShapeUtil::HumanString(result.shape()).c_str(), result.ToString().c_str()); + auto& snapshot = snapshots[i]; if (snapshot.has_result()) { std::unique_ptr literal = Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 10c0adc6707f01fcee87303a6e2ec5c570601309..3b72eb17c600abf542caffb66fe150a051b4bb4d 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -104,15 +104,6 @@ message DebugOptions { // interpretation of this value is left to the backends. int32 xla_backend_optimization_level = 31; - // When true, "unsafe" mathematical optimizations are enabled. These - // transformations include but are not limited to: - // - // - Reducing the precision of operations (e.g. using an approximate sin - // function, or transforming x/y into x * (1/y)). - // - Assuming that operations never produce or consume NaN or +/- Inf. - // - Assuming that +0 and -0 are indistinguishable. - bool xla_enable_fast_math = 32; - // Embed the compiler IR as a string in the executable. bool xla_embed_ir_in_executable = 33; @@ -194,6 +185,16 @@ message DebugOptions { // Maximum kernel unroll factor for the GPU backend. int32 xla_gpu_max_kernel_unroll_factor = 98; + // When true, "unsafe" mathematical optimizations are enabled. These + // transformations include but are not limited to: + // + // - Reducing the precision of operations (e.g. using an approximate sin + // function, or transforming x/y into x * (1/y)). + // - Assuming that operations never produce or consume NaN or +/- Inf. + // - Assuming that +0 and -0 are indistinguishable. + bool xla_cpu_enable_fast_math = 99; + bool xla_gpu_enable_fast_math = 100; + // Extra options to pass to the compilation backend; specific interpretation // of these values is left to the backend. map xla_backend_extra_options = 500; diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 0b300dc7b2d03cc8e1564f78412cc610cff518cd..4c35e93d38450b8263290da8e327d1f2126c1532 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -447,6 +447,20 @@ message GatherDimensionNumbers { int64 index_vector_dim = 4; } +// Describes the dimension numbers for a scatter operation. +// +// All the fields are similar to the corresponding fields in +// GatherDimensionNumbers. Differences are noted below. +message ScatterDimensionNumbers { + // The set of dimensions in the updates shape that are window dimensions. + repeated int64 update_window_dims = 1; + // The set of window dimensions that must be inserted into the updates shape. + repeated int64 inserted_window_dims = 2; + + repeated int64 scatter_dims_to_operand_dims = 3; + int64 index_vector_dim = 4; +} + message ConvolutionDimensionNumbers { // The number of the dimension that represents batch in the input. int64 input_batch_dimension = 7; @@ -547,3 +561,11 @@ message OpSharding { // to. repeated OpSharding tuple_shardings = 5; } + +// Describes the replica groups in a cross replica op (e.g., all-reduce and +// all-to-all). +message ReplicaGroup { + // The ids of the replicas that belongs to the same group. The ordering of the + // ids matters in some op (e.g., all-to-all). + repeated int64 replica_ids = 1; +} diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 6a4e252b44881c679350e121b1793e3b797f0785..23bb783e2207da7076833138f4421980ad20bd96 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -46,6 +46,7 @@ py_library( "//tensorflow/contrib/gan", "//tensorflow/contrib/graph_editor:graph_editor_py", "//tensorflow/contrib/grid_rnn:grid_rnn_py", + "//tensorflow/contrib/hadoop", "//tensorflow/contrib/hooks", "//tensorflow/contrib/image:distort_image_py", "//tensorflow/contrib/image:image_py", @@ -107,7 +108,6 @@ py_library( "//tensorflow/contrib/tfprof", "//tensorflow/contrib/timeseries", "//tensorflow/contrib/tpu", - "//tensorflow/contrib/tpu:tpu_py", "//tensorflow/contrib/training:training_py", "//tensorflow/contrib/util:util_py", "//tensorflow/python:util", @@ -147,6 +147,7 @@ cc_library( "//tensorflow/contrib/coder:all_kernels", "//tensorflow/contrib/data/kernels:dataset_kernels", "//tensorflow/contrib/factorization/kernels:all_kernels", + "//tensorflow/contrib/hadoop:dataset_kernels", "//tensorflow/contrib/input_pipeline:input_pipeline_ops_kernels", "//tensorflow/contrib/layers:sparse_feature_cross_op_kernel", "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_kernels", @@ -182,6 +183,7 @@ cc_library( "//tensorflow/contrib/data:dataset_ops_op_lib", "//tensorflow/contrib/factorization:all_ops", "//tensorflow/contrib/framework:all_ops", + "//tensorflow/contrib/hadoop:dataset_ops_op_lib", "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib", "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib", "//tensorflow/contrib/nccl:nccl_ops_op_lib", diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index ded05da71877566781a5fb6d0c21e1c8d43de9ed..e18ea8df4df719a7317333cf9038ce7facf8d6ac 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -22,6 +22,7 @@ from __future__ import print_function import os # Add projects here, they will show up under tf.contrib. +from tensorflow.contrib import autograph from tensorflow.contrib import batching from tensorflow.contrib import bayesflow from tensorflow.contrib import checkpoint diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py index 159d985db5c48f8fe1a26350255f8d8f68482473..3b539734a236804026826a8117d9c668c0dd089a 100644 --- a/tensorflow/contrib/all_reduce/python/all_reduce.py +++ b/tensorflow/contrib/all_reduce/python/all_reduce.py @@ -32,10 +32,10 @@ def _flatten_tensors(tensors): """Check tensors for isomorphism and flatten. Args: - tensors: list of T @{tf.Tensor} which must all have the same shape. + tensors: list of T `tf.Tensor` which must all have the same shape. Returns: - tensors: a list of T @{tf.Tensor} which are flattened (1D) views of tensors + tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors shape: the original shape of each element of input tensors Raises: @@ -61,12 +61,12 @@ def _reshape_tensors(tensors, shape): """Reshape tensors flattened by _flatten_tensors. Args: - tensors: list of T @{tf.Tensor} of identical length 1D tensors. + tensors: list of T `tf.Tensor` of identical length 1D tensors. shape: list of integers describing the desired shape. Product of the elements must equal the length of each tensor. Returns: - list of T @{tf.Tensor} which are the reshaped inputs. + list of T `tf.Tensor` which are the reshaped inputs. """ reshaped = [] for t in tensors: @@ -79,12 +79,12 @@ def _padded_split(tensor, pieces): """Like split for 1D tensors but pads-out case where len % pieces != 0. Args: - tensor: T @{tf.Tensor} that must be 1D. + tensor: T `tf.Tensor` that must be 1D. pieces: a positive integer specifying the number of pieces into which tensor should be split. Returns: - list of T @{tf.Tensor} of length pieces, which hold the values of + list of T `tf.Tensor` of length pieces, which hold the values of thin input tensor, in order. The final tensor may be zero-padded on the end to make its size equal to those of all of the other tensors. @@ -132,11 +132,11 @@ def _strip_padding(tensors, pad_len): """Strip the suffix padding added by _padded_split. Args: - tensors: list of T @{tf.Tensor} of identical length 1D tensors. + tensors: list of T `tf.Tensor` of identical length 1D tensors. pad_len: number of elements to be stripped from the end of each tensor. Returns: - list of T @{tf.Tensor} which are the stripped inputs. + list of T `tf.Tensor` which are the stripped inputs. Raises: ValueError: tensors must be a non-empty list of 1D tensors, and @@ -161,12 +161,12 @@ def _ragged_split(tensor, pieces): """Like split for 1D tensors but allows case where len % pieces != 0. Args: - tensor: T @{tf.Tensor} that must be 1D. + tensor: T `tf.Tensor` that must be 1D. pieces: a positive integer specifying the number of pieces into which tensor should be split. Returns: - list of T @{tf.Tensor} of length pieces, which hold the values of + list of T `tf.Tensor` of length pieces, which hold the values of the input tensor, in order. The final tensor may be shorter than the others, which will all be of equal length. @@ -256,7 +256,7 @@ def build_ring_all_reduce(input_tensors, num_workers, num_subchunks, """Construct a subgraph performing a ring-style all-reduce of input_tensors. Args: - input_tensors: a list of T @{tf.Tensor} objects, which must all + input_tensors: a list of T `tf.Tensor` objects, which must all have the same shape and type. num_workers: number of worker tasks spanned by input_tensors. num_subchunks: number of subchunks each device should process in one tick. @@ -272,7 +272,7 @@ def build_ring_all_reduce(input_tensors, num_workers, num_subchunks, size. Returns: - a list of T @{tf.Tensor} identical sum-reductions of input_tensors. + a list of T `tf.Tensor` identical sum-reductions of input_tensors. """ if len(input_tensors) < 2: raise ValueError("input_tensors must be length 2 or longer") @@ -299,7 +299,7 @@ def _build_ring_gather(input_tensors, devices, num_subchunks, """Construct a subgraph for the first (reduction) pass of ring all-reduce. Args: - input_tensors: a list of T @{tf.Tensor} 1D input tensors of same + input_tensors: a list of T `tf.Tensor` 1D input tensors of same shape and type. devices: array of device name strings num_subchunks: number of subchunks each device should process in one tick. @@ -311,7 +311,7 @@ def _build_ring_gather(input_tensors, devices, num_subchunks, ValueError: tensors must all be one dimensional. Returns: - list of list of T @{tf.Tensor} of (partially) reduced values where + list of list of T `tf.Tensor` of (partially) reduced values where exactly num_subchunks chunks at each device are fully reduced. """ num_devices = len(input_tensors) @@ -360,11 +360,11 @@ def _apply_unary_to_chunks(f, chunks_by_dev): """Apply a unary op to each tensor in chunks_by_dev, on same device. Args: - f: a unary function over T @{tf.Tensor}. - chunks_by_dev: list of lists of T @{tf.Tensor}. + f: a unary function over T `tf.Tensor`. + chunks_by_dev: list of lists of T `tf.Tensor`. Returns: - new list of lists of T @{tf.Tensor} with the same structure as + new list of lists of T `tf.Tensor` with the same structure as chunks_by_dev containing the derived tensors. """ output = [] @@ -381,14 +381,14 @@ def _build_ring_scatter(pred_by_s_d, rank_by_s_d, Args: pred_by_s_d: as produced by _ring_permutations rank_by_s_d: as produced by _ring_permutations - chunks_by_dev: list of list of T @{tf.Tensor} indexed by ints + chunks_by_dev: list of list of T `tf.Tensor` indexed by ints (device, chunk) Raises: ValueError: chunks_by_dev is not well-formed Returns: - list of T @{tf.Tensor} which are the fully reduced tensors, one + list of T `tf.Tensor` which are the fully reduced tensors, one at each device corresponding to the outer dimension of chunks_by_dev. """ num_devices = len(chunks_by_dev) @@ -448,12 +448,12 @@ def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None): the future with edge-case specific logic. Args: - input_tensors: list of T @{tf.Tensor} to be elementwise reduced. + input_tensors: list of T `tf.Tensor` to be elementwise reduced. red_op: a binary elementwise reduction Op. un_op: an optional unary elementwise Op to apply to reduced values. Returns: - list of T @{tf.Tensor} which are the fully reduced tensors, one + list of T `tf.Tensor` which are the fully reduced tensors, one at each device of input_tensors. Raises: @@ -475,13 +475,13 @@ def _build_recursive_hd_gather(input_tensors, devices, red_op): """Construct the gather phase of recursive halving-doubling all-reduce. Args: - input_tensors: list of T @{tf.Tensor} to be elementwise reduced. + input_tensors: list of T `tf.Tensor` to be elementwise reduced. devices: a list of strings naming the devices hosting input_tensors, which will also be used to host the (partial) reduction values. red_op: a binary elementwise reduction Op. Returns: - list of T @{tf.Tensor} which are the fully reduced tensor shards. + list of T `tf.Tensor` which are the fully reduced tensor shards. Raises: ValueError: num_devices not a power of 2, or tensor len not divisible @@ -516,12 +516,12 @@ def _build_recursive_hd_scatter(input_tensors, devices): """Construct the scatter phase of recursive halving-doublng all-reduce. Args: - input_tensors: list of T @{tf.Tensor} that are fully-reduced shards. + input_tensors: list of T `tf.Tensor` that are fully-reduced shards. devices: a list of strings naming the devices on which the reconstituted full tensors should be placed. Returns: - list of T @{tf.Tensor} which are the fully reduced tensors. + list of T `tf.Tensor` which are the fully reduced tensors. """ num_devices = len(devices) num_hops = int(math.log(num_devices, 2)) @@ -571,7 +571,7 @@ def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None): un_op: optional elementwise unary Op to be applied to fully-reduced values. Returns: - list of T @{tf.Tensor} which are the fully reduced tensors. + list of T `tf.Tensor` which are the fully reduced tensors. """ input_tensors, shape = _flatten_tensors(input_tensors) dst_devices = [t.device for t in input_tensors] @@ -594,7 +594,7 @@ def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None): un_op: optional elementwise unary Op to be applied to fully-reduced values. Returns: - list of T @{tf.Tensor} which are the fully reduced shards. + list of T `tf.Tensor` which are the fully reduced shards. Raises: ValueError: inputs not well-formed. @@ -629,7 +629,7 @@ def _build_shuffle_scatter(reduced_shards, dst_devices): should be reconstituted. Returns: - list of T @{tf.Tensor} scattered tensors. + list of T `tf.Tensor` scattered tensors. """ num_devices = len(dst_devices) out_tensors = [] @@ -644,7 +644,7 @@ def _split_by_task(devices, values): Args: devices: list of device name strings - values: list of T @{tf.tensor} of same length as devices. + values: list of T `tf.tensor` of same length as devices. Returns: (per_task_devices, per_task_values) where both values are @@ -680,14 +680,14 @@ def build_nccl_all_reduce(input_tensors, red_op, un_op=None): """Build a subgraph that does one full all-reduce, using NCCL. Args: - input_tensors: list of T @{tf.Tensor} of same-shape and type values to + input_tensors: list of T `tf.Tensor` of same-shape and type values to be reduced. red_op: binary elementwise reduction operator. Must be one of {tf.add} un_op: optional unary elementwise Op to apply to fully-reduce values. Returns: - list of T @{tf.Tensor} of reduced values. + list of T `tf.Tensor` of reduced values. Raises: ValueError: red_op not supported. @@ -709,14 +709,14 @@ def _build_nccl_hybrid(input_tensors, red_op, upper_level_f): """Construct a subgraph for NCCL hybrid all-reduce. Args: - input_tensors: list of T @{tf.Tensor} of same-shape and type values to + input_tensors: list of T `tf.Tensor` of same-shape and type values to be reduced. red_op: binary elementwise reduction operator. upper_level_f: function for reducing one value per worker, across workers. Returns: - list of T @{tf.Tensor} of reduced values. + list of T `tf.Tensor` of reduced values. Raises: ValueError: inputs not well-formed. @@ -797,7 +797,7 @@ def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f): """Construct a subgraph for Shuffle hybrid all-reduce. Args: - input_tensors: list of T @{tf.Tensor} of same-shape and type values to + input_tensors: list of T `tf.Tensor` of same-shape and type values to be reduced. gather_devices: list of device names on which to host gather shards. red_op: binary elementwise reduction operator. @@ -805,7 +805,7 @@ def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f): workers. Returns: - list of T @{tf.Tensor} of reduced values. + list of T `tf.Tensor` of reduced values. Raises: ValueError: inputs not well-formed. diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD index 7cbba7168383f3d0cdc80fda9908cb7d70836bb4..2d2ab7040a8bb76f9538f201f75a2e4dcba0f511 100644 --- a/tensorflow/contrib/autograph/converters/BUILD +++ b/tensorflow/contrib/autograph/converters/BUILD @@ -204,6 +204,7 @@ py_test( name = "side_effect_guards_test", srcs = ["side_effect_guards_test.py"], srcs_version = "PY2AND3", + tags = ["notsan"], deps = [ ":converters", "//tensorflow/contrib/autograph/core:test_lib", diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py index a36b3d77a9233daed864c616306b2ad27f582a38..2d1bed3367fa0b283200b775c5953da80c855367 100644 --- a/tensorflow/contrib/autograph/converters/call_trees.py +++ b/tensorflow/contrib/autograph/converters/call_trees.py @@ -238,7 +238,7 @@ class CallTreeTransformer(converter.Base): # Before we could convert all the time though, we'd need a reasonable # caching mechanism. template = """ - ag__.converted_call(func, True, False, {}, args) + ag__.converted_call(func, True, False, False, {}, args) """ call_expr = templates.replace(template, func=node.func, args=node.args) new_call = call_expr[0].value diff --git a/tensorflow/contrib/autograph/converters/directives.py b/tensorflow/contrib/autograph/converters/directives.py index ccdf79d47be65dd777a7ae3a226246a62e274430..77f625bac792621c45799d1a220f99eb4b99f7af 100644 --- a/tensorflow/contrib/autograph/converters/directives.py +++ b/tensorflow/contrib/autograph/converters/directives.py @@ -42,10 +42,30 @@ def _map_args(call_node, function): Returns: Dict[Text, ast.AST], mapping each of the function's argument names to the respective AST node. + Raises: + ValueError: if the default arguments are not correctly set """ args = call_node.args kwds = {kwd.arg: kwd.value for kwd in call_node.keywords} - return tf_inspect.getcallargs(function, *args, **kwds) + call_args = tf_inspect.getcallargs(function, *args, **kwds) + + # Keyword arguments not specified in kwds will be mapped to their defaults, + # which are Python values. Since we don't currently have a way to transform + # those into AST references, we simply remove them. By convention, directives + # use UNSPECIFIED as default value for for optional arguments. No other + # defaults should be present. + unexpected_defaults = [] + for k in call_args: + if (k not in kwds + and call_args[k] not in args + and call_args[k] is not directives.UNSPECIFIED): + unexpected_defaults.append(k) + if unexpected_defaults: + raise ValueError('Unexpected keyword argument values, %s, for function %s' + % (zip(unexpected_defaults, + [call_args[k] for k in unexpected_defaults]), + function)) + return {k: v for k, v in call_args.items() if v is not directives.UNSPECIFIED} class DirectivesTransformer(converter.Base): diff --git a/tensorflow/contrib/autograph/converters/directives_test.py b/tensorflow/contrib/autograph/converters/directives_test.py index a573ba5850609f65ea60432470485c523cd3da3b..a2d083b891314d2f8f3fa61b46edc347ca8e24eb 100644 --- a/tensorflow/contrib/autograph/converters/directives_test.py +++ b/tensorflow/contrib/autograph/converters/directives_test.py @@ -23,6 +23,7 @@ from tensorflow.contrib.autograph.core import converter_testing from tensorflow.contrib.autograph.core.converter import AgAnno from tensorflow.contrib.autograph.lang import directives from tensorflow.contrib.autograph.pyct import anno +from tensorflow.contrib.autograph.pyct import parser from tensorflow.python.platform import test @@ -71,7 +72,23 @@ class DirectivesTest(converter_testing.TestCase): d = d[directives.set_loop_options] self.assertEqual(d['parallel_iterations'].n, 10) self.assertEqual(d['back_prop'].id, 'a') - self.assertEqual(d['swap_memory'], directives.UNSPECIFIED) + self.assertNotIn('swap_memory', d) + + def test_invalid_default(self): + + def invalid_directive(valid_arg, invalid_default=object()): + del valid_arg + del invalid_default + return + + def call_invalid_directive(): + invalid_directive(1) + + node, _ = parser.parse_entity(call_invalid_directive) + # Find the call to the invalid directive + node = node.body[0].body[0].value + with self.assertRaisesRegexp(ValueError, 'Unexpected keyword.*'): + directives_converter._map_args(node, invalid_directive) if __name__ == '__main__': diff --git a/tensorflow/contrib/autograph/converters/error_handlers_test.py b/tensorflow/contrib/autograph/converters/error_handlers_test.py index cd74e5f18f76d0c531f487bc0c736b421c9c3fb4..5d61b220afa0fcf9a9e619bbd78f83a5076c473a 100644 --- a/tensorflow/contrib/autograph/converters/error_handlers_test.py +++ b/tensorflow/contrib/autograph/converters/error_handlers_test.py @@ -34,8 +34,10 @@ class ErrorHandlersTest(converter_testing.TestCase): raise ValueError() node, ctx = self.prepare(test_fn, {}) - anno.setanno(node, anno.Basic.ORIGIN, - origin_info.OriginInfo(None, None, None)) + anno.setanno( + node, anno.Basic.ORIGIN, + origin_info.OriginInfo(None, 'test_function_name', 'test_code', + 'test_comment')) node = error_handlers.transform(node, ctx) with self.compiled(node, {}) as result: with self.assertRaises(errors.GraphConstructionError): diff --git a/tensorflow/contrib/autograph/core/converter.py b/tensorflow/contrib/autograph/core/converter.py index a93e4a806469db63e7d767563e64dadfe71f50ee..83a80c1f52123c325782a67c651e892163af83b3 100644 --- a/tensorflow/contrib/autograph/core/converter.py +++ b/tensorflow/contrib/autograph/core/converter.py @@ -233,7 +233,7 @@ class Base(transformer.Base): arg_values = [] for def_ in defs: if (directive not in def_.directives or - arg not in arg not in def_.directives[directive]): + arg not in def_.directives[directive]): continue arg_value = def_.directives[directive][arg] for prev_value in arg_values: diff --git a/tensorflow/contrib/autograph/core/errors_test.py b/tensorflow/contrib/autograph/core/errors_test.py index c0e2c74e47ddfb8ee812d6d839b06784e7a01dba..404c1f5456f9654724d068e3007fe9ced15cbf07 100644 --- a/tensorflow/contrib/autograph/core/errors_test.py +++ b/tensorflow/contrib/autograph/core/errors_test.py @@ -43,7 +43,8 @@ class RuntimeErrorsTest(test.TestCase): filename = tf_inspect.getsourcefile(function) lineno += line_offset loc = origin_info.LineLocation(filename, lineno) - origin = origin_info.OriginInfo(loc, 'test_function_name', 'test_code') + origin = origin_info.OriginInfo(loc, 'test_function_name', 'test_code', + 'test_comment') return loc, origin def test_improved_errors_basic(self): diff --git a/tensorflow/contrib/autograph/docs/pyfunc_dtypes.md b/tensorflow/contrib/autograph/docs/pyfunc_dtypes.md new file mode 100644 index 0000000000000000000000000000000000000000..bcbb920cc53de4b89dc67128c9c2c2312f030f0a --- /dev/null +++ b/tensorflow/contrib/autograph/docs/pyfunc_dtypes.md @@ -0,0 +1,33 @@ +# Specifying return data type for `py_func` calls + +The `py_func` op requires specifying a +[data type](https://www.tensorflow.org/guide/tensors#data_types). + +When wrapping a function with `py_func`, for instance using +`@autograph.do_not_convert(run_mode=autograph.RunMode.PY_FUNC)`, you have two +options to specify the returned data type: + + * explicitly, with a specified `tf.DType` value + * by matching the data type of an input argument, which is then assumed to be + a `Tensor` + +Examples: + +Specify an explicit data type: + +``` + def foo(a): + return a + 1 + + autograph.util.wrap_py_func(f, return_dtypes=[tf.float32]) +``` + +Match the data type of the first argument: + +``` + def foo(a): + return a + 1 + + autograph.util.wrap_py_func( + f, return_dtypes=[autograph.utils.py_func.MatchDType(0)]) +``` diff --git a/tensorflow/contrib/autograph/examples/integration_tests/BUILD b/tensorflow/contrib/autograph/examples/integration_tests/BUILD index 0ab4e2eb5e13f99f062afb90dd9c4087ea12e5ba..6c281485b4a3c4d09292a4d7af16330cdc44edd4 100644 --- a/tensorflow/contrib/autograph/examples/integration_tests/BUILD +++ b/tensorflow/contrib/autograph/examples/integration_tests/BUILD @@ -22,6 +22,7 @@ py_test( "errors_test.py", ], srcs_version = "PY2AND3", + tags = ["no_windows"], visibility = ["//visibility:public"], deps = [ "//tensorflow:tensorflow_py", diff --git a/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py b/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py index 73125eb452fc3f3f94a8323d677341345931c4ea..7e7ef5a3e2bbf6a15936eb181c9c4112f8b820e6 100644 --- a/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py +++ b/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py @@ -44,6 +44,33 @@ class ModelWithStaticConditional(object): return x +class BasicBlock(tf.keras.Model): + + def __init__(self): + super(BasicBlock, self).__init__() + self.conv1 = tf.keras.layers.Conv2D(8, 3) + self.pool = tf.keras.layers.GlobalAveragePooling2D() + self.dense = tf.keras.layers.Dense(3) + + def call(self, x): + x = self.conv1(x) + x = self.pool(x) + x = self.dense(x) + return x + + +class CompoundModel(tf.keras.Model): + + def __init__(self): + super(CompoundModel, self).__init__() + self.block = BasicBlock() + + @autograph.convert(recursive=True) + def call(self, x): + x = self.block(x) # pylint: disable=not-callable + return x + + class KerasTest(tf.test.TestCase): def test_basic(self): @@ -57,6 +84,20 @@ class KerasTest(tf.test.TestCase): model = ModelWithStaticConditional(True) self.assertEqual(model.call(), 25) + def test_recursive_true(self): + with self.assertRaisesRegexp(NotImplementedError, + 'Object conversion is not yet supported.'): + with tf.Graph().as_default(): + model = CompoundModel() + model.build(tf.TensorShape((None, 10, 10, 1))) + init = tf.global_variables_initializer() + + with tf.Session() as sess: + sess.run(init) + sample_input = tf.random_uniform((1, 10, 10, 1)) + output = model(sample_input) # pylint: disable=not-callable + self.assertEqual(sess.run(output).shape, (1, 3)) + if __name__ == '__main__': tf.test.main() diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py index 0adff76a9f2cae9480954a32dd1a81458bab37f7..276a3871801da2c66fbfffc38ac1ea39704b5de1 100644 --- a/tensorflow/contrib/autograph/impl/api.py +++ b/tensorflow/contrib/autograph/impl/api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Public API.""" +"""This module contains the user-facing API for AutoGraph.""" from __future__ import absolute_import from __future__ import division @@ -42,33 +42,30 @@ from tensorflow.python.util import tf_inspect # (currently we require (module + class name, type)) -def convert(recursive=False, verbose=False, arg_types=None): - """Decorator that compiles a function to graph mode. +# TODO(mdan): This should behave like to_graph (e.g. convert statically). +def convert(recursive=False, verbose=False): + """Decorator that compiles a function to use TensorFlow ops. - The decorator is dynamic - invoking compilation whenever the decorated - function is called. This means the parameter values are known at compilation. + The decorator is dynamic - it recompiles the target whenever the decorated + function is called. This means the parameter values are known at conversion. + It also means that repeated calls with different types of parameters will be + correctly processed. Args: - recursive: Whether to recursively convert any functions that the decorator - function may call. - verbose: Whether to output the compiled code in the logs. - arg_types: See to_graph. + recursive: bool, whether to recursively convert any functions or classes + that the converted function may use. + verbose: bool, whether to output the compiled code in the logs. Returns: - A decorator that compiles the given function to graph mode. - - Raises: - ValueError: If any of the arguments are illegal. + Callable, a decorator that converts the given function into an equivalent + function that uses TensorFlow ops. """ - if arg_types is None: - arg_types = {} - def decorator(f): """Decorator implementation.""" @wraps(f) def wrapper(*args, **kwargs): - return converted_call(f, recursive, verbose, arg_types, *args, **kwargs) + return converted_call(f, recursive, verbose, True, {}, *args, **kwargs) wrapper = tf_decorator.make_decorator(f, wrapper) @@ -81,22 +78,34 @@ def convert(recursive=False, verbose=False, arg_types=None): class RunMode(Enum): + """Specifies the way a converted function or method should be executed in TF. + + The enum values have the following semantics: + + * GRAPH: Call this function directly, as-is. This is suitable for functions + that were already designed for TF graphs and contain ops. + * PY_FUNC: Wrap this function into a py_func op. This is suitable for code + that will only run correctly in Python, for example code that renders + to the display, reads keyboard input, etc. + """ GRAPH = 1 PY_FUNC = 2 def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None): - """Decorator that suppresses compilation of a function. + """Decorator that suppresses the conversion of a function. + + See also: docs/pyfunc_dtypes.md Args: - run_as: RunMode value. Whether to run the function as-is, or wrap it into - a py_func. - return_dtypes: See autograph.utils.py_func.wrap_py_func. Setting to None or - empty list or tuple will create a dummy return value that can be used - to set control dependencies. + run_as: RunMode, specifies how to use the function in TensorFlow. + return_dtypes: Optional[Iterable[ + Union[tf.DType, utils.py_func.MatchDType]]], the return data types of + the converted function, if run_as is RunMode.PY_FUNC. Ignored otherwise. + May be set to None if the function has no return values. Returns: - A decorator that wraps the original function. + Callable, a decorator that wraps the original function. """ def decorator(f): @@ -129,12 +138,13 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None): return decorator -def converted_call(f, recursive, verbose, arg_types, *args, **kwargs): - """Compiles a function call inline.""" +# TODO(mdan): Move to a private, undocumented module. +def converted_call(f, recursive, verbose, force_conversion, arg_types, *args, + **kwargs): + """Compiles a function call inline. For internal use only.""" # TODO(mdan): This needs cleanup. # In particular, we may want to avoid renaming functions altogether. - - if conversion.is_whitelisted_for_graph(f): + if not force_conversion and conversion.is_whitelisted_for_graph(f): return f(*args, **kwargs) unknown_arg_value = object() # Sentinel for arguments of unknown value @@ -201,39 +211,41 @@ def converted_call(f, recursive, verbose, arg_types, *args, **kwargs): return converted_f(*effective_args, **kwargs) +# TODO(mdan): Rename: to_ops? +# TODO(mdan): Looki into overloading as function and decorator, like tfe.defun. +# TODO(mdan): Remove partial_types. def to_graph(e, recursive=True, verbose=False, arg_values=None, arg_types=None, partial_types=None): - """Compile a Python entity into equivalent TensorFlow code. + """Converts a Python entity into equivalent code that uses TensorFlow ops. - Currently supported entities: + Supported Python entities include: * functions * classes - Classes are handled by converting all their methods into a new class. + Classes are converted by converting all their methods into a new class. Args: - e: A Python entity. - recursive: Whether to recursively convert any functions that the decorator - function may call. - verbose: Whether to output the compiled code in the logs. - arg_values: A dict containing value hints for symbols like function - parameters. - arg_types: A dict containing type hints for symbols like function - parameters. - partial_types: A set of types (e.g. classes) that will not be converted - entirely. Calls to member functions for these types will be renamed - independently. + e: Union[Callable, Type], the Python entity to convert. + recursive: bool, whether to recursively convert any functions that the + converted function may call. + verbose: bool, whether to output the compiled code in the logs. + arg_values: Optional[Dict[Text, Any]], value hints for symbols including + function arguments. + arg_types: Optional[Dict[Text, Type]], type hints for symbols including + function arguments. + partial_types: Set[Type], reserved for internal use. Returns: - A function with a signature identical to `o`, but which when executed it - creates TF a graph that has the same functionality as the original entity. + Union[Callable, Type], the converted entity, which is the same kind as e + (that is, a function is e is a function, a class if e is a class, etc.) but + its code has been converted to use TF ops. + Raises: - ValueError: If the converted function defines or refers to symbol names that - are reserved for AutoGraph. + ValueError: If the entity could not be converted. """ program_ctx = converter.ProgramContext( recursive=recursive, @@ -287,20 +299,23 @@ def to_code(e, arg_types=None, partial_types=None, indentation=' '): - """Return the equivalent of an entity in TensorFlow code. + """Returns the equivalent code that uses TensorFlow ops. - See `to_graph` for more details. + Also see: `to_graph`, `convert` Args: - e: A Python entity. - recursive: See to_graph. - arg_values: See to_graph. - arg_types: See to_graph. - partial_types: See to_graph. - indentation: String, when to use for each level of indentation. + e: Union[Callable, Type], the Python entity to convert. + recursive: bool, whether to recursively convert any functions that the + converted function may call. + arg_values: Optional[Dict[Text, Any]], value hints for symbols including + function arguments. + arg_types: Optional[Dict[Text, Type]], type hints for symbols including + function arguments. + partial_types: Set[Type], reserved for internal use. + indentation: Text, when to use for each level of indentation. Returns: - String. + Text, the converted code. """ program_ctx = converter.ProgramContext( recursive=recursive, diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py index 754baa87b0c3e4fa071923686078ac7235076533..803fde9089b1c004d9bfc0dfefd3d6b422752f0a 100644 --- a/tensorflow/contrib/autograph/impl/api_test.py +++ b/tensorflow/contrib/autograph/impl/api_test.py @@ -183,8 +183,8 @@ class ApiTest(test.TestCase): @api.convert(recursive=True) def test_method(self, x, s, a): while tf.reduce_sum(x) > s: - x //= api.converted_call(self.called_member, False, False, {}, self, - a) + x //= api.converted_call(self.called_member, False, False, False, {}, + self, a) return x tc = TestClass() @@ -195,7 +195,7 @@ class ApiTest(test.TestCase): self.assertListEqual([0, 1], sess.run(x).tolist()) def test_converted_call_builtin(self): - x = api.converted_call(range, False, False, {}, 3) + x = api.converted_call(range, False, False, False, {}, 3) self.assertEqual((0, 1, 2), tuple(x)) def test_converted_call_function(self): @@ -206,7 +206,7 @@ class ApiTest(test.TestCase): return x with self.test_session() as sess: - x = api.converted_call(test_fn, False, False, {}, + x = api.converted_call(test_fn, False, False, False, {}, constant_op.constant(-1)) self.assertEqual(1, sess.run(x)) @@ -224,7 +224,7 @@ class ApiTest(test.TestCase): with self.test_session() as sess: tc = TestClass(constant_op.constant(-1)) - x = api.converted_call(tc.test_method, False, False, {}, tc) + x = api.converted_call(tc.test_method, False, False, False, {}, tc) self.assertEqual(1, sess.run(x)) def test_converted_call_method_by_class(self): @@ -241,7 +241,7 @@ class ApiTest(test.TestCase): with self.test_session() as sess: tc = TestClass(constant_op.constant(-1)) - x = api.converted_call(TestClass.test_method, False, False, {}, tc) + x = api.converted_call(TestClass.test_method, False, False, False, {}, tc) self.assertEqual(1, sess.run(x)) def test_converted_call_callable_object(self): @@ -258,7 +258,7 @@ class ApiTest(test.TestCase): with self.test_session() as sess: tc = TestClass(constant_op.constant(-1)) - x = api.converted_call(tc, False, False, {}) + x = api.converted_call(tc, False, False, False, {}) self.assertEqual(1, sess.run(x)) def test_converted_call_constructor(self): @@ -274,7 +274,7 @@ class ApiTest(test.TestCase): return self.x with self.test_session() as sess: - tc = api.converted_call(TestClass, False, False, {}, + tc = api.converted_call(TestClass, False, False, False, {}, constant_op.constant(-1)) # tc is now a converted object. x = tc.test_method() @@ -286,11 +286,12 @@ class ApiTest(test.TestCase): return x == 0 with self.test_session() as sess: - x = api.converted_call(f, False, False, {}, constant_op.constant(0)) + x = api.converted_call(f, False, False, False, {}, + constant_op.constant(0)) self.assertTrue(sess.run(x)) converted_f = api.to_graph(f) - x = api.converted_call(converted_f, False, False, {}, + x = api.converted_call(converted_f, False, False, False, {}, constant_op.constant(0)) self.assertTrue(sess.run(x)) diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py index afb10d4d8bec648a43615762f780f989eb8de950..fc8a976d3f3ecdc9c6339995dd0dfc776824b90d 100644 --- a/tensorflow/contrib/autograph/impl/conversion.py +++ b/tensorflow/contrib/autograph/impl/conversion.py @@ -118,6 +118,17 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types): node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types) elif tf_inspect.ismethod(o): node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types) + # TODO(mdan,yashkatariya): Remove when object conversion is implemented. + elif hasattr(o, '__class__'): + raise NotImplementedError( + 'Object conversion is not yet supported. If you are ' + 'trying to convert code that uses an existing object, ' + 'try including the creation of that object in the ' + 'conversion. For example, instead of converting the method ' + 'of a class, try converting the entire class instead. ' + 'See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/' + 'contrib/autograph/README.md#using-the-functional-api ' + 'for more information.') else: raise ValueError( 'Entity "%s" has unsupported type "%s". Only functions and classes are ' @@ -181,7 +192,7 @@ def class_to_graph(c, program_ctx): class_name = namer.compiled_class_name(c.__name__, c) # TODO(mdan): This needs to be explained more thoroughly. - # Process any base classes: if the sueprclass if of a whitelisted type, an + # Process any base classes: if the superclass if of a whitelisted type, an # absolute import line is generated. Otherwise, it is marked for conversion # (as a side effect of the call to namer.compiled_class_name() followed by # program_ctx.update_name_map(namer)). diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py index 1c5d4d09c4e74a3f88b47186aa563419aa7dcb07..86432573a719ea3f2b163746996dbf3301785a91 100644 --- a/tensorflow/contrib/autograph/impl/conversion_test.py +++ b/tensorflow/contrib/autograph/impl/conversion_test.py @@ -50,7 +50,7 @@ class ConversionTest(test.TestCase): self.assertTrue(conversion.is_whitelisted_for_graph(constant_op.constant)) def test_entity_to_graph_unsupported_types(self): - with self.assertRaises(ValueError): + with self.assertRaises(NotImplementedError): program_ctx = self._simple_program_ctx() conversion.entity_to_graph('dummy', program_ctx, None, None) diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py index 988df70157170ed0a9ece33976e871e6f7693bbc..9909e521644a7a901653dc09853222167828c75c 100644 --- a/tensorflow/contrib/autograph/operators/control_flow.py +++ b/tensorflow/contrib/autograph/operators/control_flow.py @@ -141,7 +141,7 @@ def _dataset_for_stmt(ds, extra_test, body, init_state): while_body, init_state=(epoch_number, iterate) + init_state, extra_deps=()) - # Dropping the epoch number and iterate because they are not not syntactically + # Dropping the epoch number and iterate because they are not syntactically # visible. results = results[2:] @@ -212,12 +212,12 @@ def if_stmt(cond, body, orelse): Tuple containing the statement outputs. """ if tensor_util.is_tensor(cond): - return _tf_if_stmt(cond, body, orelse) + return tf_if_stmt(cond, body, orelse) else: return _py_if_stmt(cond, body, orelse) -def _tf_if_stmt(cond, body, orelse): +def tf_if_stmt(cond, body, orelse): """Overload of if_stmt that stages a TF cond.""" return control_flow_ops.cond(cond, body, orelse) diff --git a/tensorflow/contrib/autograph/pyct/origin_info.py b/tensorflow/contrib/autograph/pyct/origin_info.py index 9f98e48a6a9773c17e19ca80ecdb32bae4027421..b60651a30e342dabe40cbcef1486826e16c2e2c7 100644 --- a/tensorflow/contrib/autograph/pyct/origin_info.py +++ b/tensorflow/contrib/autograph/pyct/origin_info.py @@ -18,8 +18,10 @@ from __future__ import division from __future__ import print_function import collections +import tokenize import gast +import six from tensorflow.contrib.autograph.pyct import anno from tensorflow.contrib.autograph.pyct import ast_util @@ -56,13 +58,14 @@ class Location( class OriginInfo( collections.namedtuple( 'OriginInfo', - ('loc', 'function_name', 'source_code_line'))): + ('loc', 'function_name', 'source_code_line', 'comment'))): """Container for information about the source code before conversion. Attributes: loc: Location function_name: Optional[Text] source_code_line: Text + comment: Optional[Text] """ def as_frame(self): @@ -152,6 +155,15 @@ def resolve(nodes, source, function=None): function_lineno = None function_filepath = None + # TODO(mdan): Pull this to a separate utility. + code_reader = six.StringIO(source) + comment_map = {} + for token in tokenize.generate_tokens(code_reader.readline): + tok_type, tok_string, loc, _, _ = token + srow, _ = loc + if tok_type == tokenize.COMMENT: + comment_map[srow] = tok_string.strip()[1:].strip() + source_lines = source.split('\n') for node in nodes: for n in gast.walk(node): @@ -162,12 +174,13 @@ def resolve(nodes, source, function=None): source_code_line = source_lines[lineno_in_body - 1] if function: - source_lineno = function_lineno + lineno_in_body - 1 + source_lineno = function_lineno + lineno_in_body function_name = function.__name__ else: source_lineno = lineno_in_body function_name = None location = Location(function_filepath, source_lineno, n.col_offset) - origin = OriginInfo(location, function_name, source_code_line) + origin = OriginInfo(location, function_name, + source_code_line, comment_map.get(source_lineno)) anno.setanno(n, anno.Basic.ORIGIN, origin) diff --git a/tensorflow/contrib/autograph/pyct/origin_info_test.py b/tensorflow/contrib/autograph/pyct/origin_info_test.py index 6d7d8b1622a2ddb1a1d0eaeec50bdfaf38f05182..eeaa13007ea0ae331293c216a76352956c0ee9ec 100644 --- a/tensorflow/contrib/autograph/pyct/origin_info_test.py +++ b/tensorflow/contrib/autograph/pyct/origin_info_test.py @@ -85,16 +85,19 @@ class OriginInfoTest(test.TestCase): self.assertEqual(origin.loc.lineno, 1) self.assertEqual(origin.loc.col_offset, 0) self.assertEqual(origin.source_code_line, 'def test_fn(x):') + self.assertIsNone(origin.comment) origin = anno.getanno(fn_node.body[0], anno.Basic.ORIGIN) self.assertEqual(origin.loc.lineno, 2) self.assertEqual(origin.loc.col_offset, 2) self.assertEqual(origin.source_code_line, ' """Docstring."""') + self.assertIsNone(origin.comment) origin = anno.getanno(fn_node.body[1], anno.Basic.ORIGIN) self.assertEqual(origin.loc.lineno, 3) self.assertEqual(origin.loc.col_offset, 2) self.assertEqual(origin.source_code_line, ' return x # comment') + self.assertEqual(origin.comment, 'comment') if __name__ == '__main__': diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py b/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py index 9a84f1231cb71745f778285f30ada151a7c1accd..7f2b379d3de236020f1ec2b8a4972cc67b10b060 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/reaching_definitions.py @@ -39,7 +39,7 @@ from tensorflow.contrib.autograph.pyct.static_analysis import annos class Definition(object): """Definition objects describe a unique definition of a variable. - Subclasses of this may be used by passing an appropriate factory fuction to + Subclasses of this may be used by passing an appropriate factory function to resolve. Attributes: diff --git a/tensorflow/contrib/autograph/pyct/testing/BUILD b/tensorflow/contrib/autograph/pyct/testing/BUILD index 957db356f7e1acf673ce5db7c8087208af43ac23..9ef1ac9663eac8febffd697d7164425716b65d9d 100644 --- a/tensorflow/contrib/autograph/pyct/testing/BUILD +++ b/tensorflow/contrib/autograph/pyct/testing/BUILD @@ -33,7 +33,10 @@ py_test( size = "large", srcs = ["codegen_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], + tags = [ + "no_windows", + "nomsan", + ], deps = [ ":testing", "//tensorflow/contrib/autograph/pyct", diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py index ccbe5fc9541dfad561d8eab730e2b15f6250ceb2..4dd440ef197b7e24b901bc9e30794b0182378a32 100644 --- a/tensorflow/contrib/autograph/utils/builtins.py +++ b/tensorflow/contrib/autograph/utils/builtins.py @@ -44,6 +44,8 @@ def dynamic_builtin(f, *args, **kwargs): return dynamic_int(*args, **kwargs) if f is float: return dynamic_float(*args, **kwargs) + if f is abs: + return dynamic_abs(*args, **kwargs) raise NotImplementedError( 'The "%s" builtin is not yet supported.' % f.__name__) @@ -81,6 +83,13 @@ def dynamic_float(num_or_tensor, **kwargs): return float(num_or_tensor) +def dynamic_abs(num_or_tensor, **kwargs): + if tensor_util.is_tensor(num_or_tensor): + return math_ops.abs(num_or_tensor, **kwargs) + else: + return abs(num_or_tensor, **kwargs) + + def dynamic_range(start_or_stop, stop=None, step=None): """Implementation of range using dynamic dispatch.""" if type_check.is_tensor(start_or_stop, stop, step): diff --git a/tensorflow/contrib/autograph/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py index b4821f36fcab8c201956e366d394bababb9f02b6..b1cd5253bc3ffb1e67d89ef79cf56eaeb65fae07 100644 --- a/tensorflow/contrib/autograph/utils/builtins_test.py +++ b/tensorflow/contrib/autograph/utils/builtins_test.py @@ -44,6 +44,23 @@ class BuiltinsTest(test.TestCase): with self.test_session() as sess: self.assertEqual(3, sess.run(builtins.dynamic_builtin(len, a))) + def test_dynamic_abs_tf_scalar(self): + a = constant_op.constant(-1) + + with self.test_session() as sess: + self.assertEqual(1, sess.run(builtins.dynamic_builtin(abs, a))) + + def test_dynamic_abs_tf_array(self): + a = constant_op.constant([-1, 2, -3]) + + with self.test_session() as sess: + self.assertListEqual([1, 2, 3], + list(sess.run(builtins.dynamic_builtin(abs, a)))) + + def test_dynamic_abs_py_scalar(self): + a = -1 + self.assertEqual(1, builtins.dynamic_builtin(abs, a)) + def test_dynamic_len_tf_matrix(self): a = constant_op.constant([[1, 2], [3, 4]]) diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md index d7c71a20ed4ba6a55dc0356ab5a3d096ed042e59..b9abfa8295f9013cd8e92f87466a73952ccceb10 100644 --- a/tensorflow/contrib/bigtable/README.md +++ b/tensorflow/contrib/bigtable/README.md @@ -1,4 +1,4 @@ -# Bigtable # +# Google Cloud Bigtable [Cloud Bigtable](https://cloud.google.com/bigtable/) is a high performance storage system that can store and serve training data. This contrib @@ -13,7 +13,7 @@ Bigtable at high speed, in particular to feed modern accelerators. For general-purpose Cloud Bigtable APIs, see the [official Cloud Bigtable client library documentation][clientdoc]. -[clientdoc]: https://cloud.google.com/bigtable/docs/reference/libraries +[clientdoc]: https://cloud.google.com/bigtable/docs/reference/libraries ## Sample Use @@ -324,7 +324,7 @@ If you encounter a log line that includes the following: "filename":"/usr/share/grpc/roots.pem" ``` -you likely need to copy the [gRPC roots.pem file][grpcPem] to +you likely need to copy the [gRPC `roots.pem` file][grpcPem] to `/usr/share/grpc/roots.pem` on your local machine. [grpcPem]: https://github.com/grpc/grpc/blob/master/etc/roots.pem @@ -338,7 +338,10 @@ are available. - **Compute Engine**: When running on Compute Engine, the client will often use the service account from the virtual machine's metadata service. Be sure to authorize your Compute Engine VM to have access to the Cloud Bigtable service - when creating your VM. + when creating your VM, or [update the VM's scopes][update-vm-scopes] on a + running VM if you run into this issue. - **Cloud TPU**: Your Cloud TPUs run with the designated Cloud TPU service account dedicated to your GCP project. Ensure the service account has been authorized via the Cloud Console to access your Cloud Bigtable instances. + +[update-vm-scopes]: https://cloud.google.com/compute/docs/access/create-enable-service-accounts-for-instances#changeserviceaccountandscopes diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc index a6755a3496f3e1720f1c8c67f75521f2380a9845..a25a641cdb4608dee6d6c1bd18697860cc1f5613 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc @@ -84,6 +84,8 @@ class BigtableClientOp : public OpKernel { channel_args.SetMaxReceiveMessageSize( max_receive_message_size_); channel_args.SetUserAgentPrefix("tensorflow"); + channel_args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 0); + channel_args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 60 * 1000); client_options.set_channel_arguments(channel_args); std::shared_ptr client = google::cloud::bigtable::CreateDefaultDataClient( @@ -216,11 +218,11 @@ class ToBigtableOp : public AsyncOpKernel { OP_REQUIRES_OK_ASYNC( ctx, GetDatasetFromVariantTensor(ctx->input(1), &dataset), done); - IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx); std::unique_ptr iterator; OP_REQUIRES_OK_ASYNC( ctx, - dataset->MakeIterator(&iter_ctx, "ToBigtableOpIterator", &iterator), + dataset->MakeIterator(IteratorContext(ctx), "ToBigtableOpIterator", + &iterator), done); int64 timestamp_int; @@ -243,9 +245,10 @@ class ToBigtableOp : public AsyncOpKernel { ::google::cloud::bigtable::BulkMutation mutation; // TODO(saeta): Make # of mutations configurable. for (uint64 i = 0; i < 100 && !end_of_sequence; ++i) { - OP_REQUIRES_OK_ASYNC( - ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence), - done); + OP_REQUIRES_OK_ASYNC(ctx, + iterator->GetNext(IteratorContext(ctx), + &components, &end_of_sequence), + done); if (!end_of_sequence) { OP_REQUIRES_OK_ASYNC( ctx, diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc index 9e49fa35db4b2cd2c8991100a28a5b9c55f01ffe..bd32672aa99d7bf70c44a264f488482c4f213a0b 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_lookup_dataset_op.cc @@ -53,7 +53,7 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, BigtableTableResource* table, @@ -61,7 +61,7 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { std::vector columns, const DataTypeVector& output_types, std::vector output_shapes) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), input_(input), table_(table), column_families_(std::move(column_families)), @@ -80,8 +80,8 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { - return std::unique_ptr(new Iterator( - {this, strings::StrCat(prefix, "::BigtableLookupDataset")})); + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::BigtableLookup")})); } const DataTypeVector& output_dtypes() const override { @@ -96,6 +96,14 @@ class BigtableLookupDatasetOp : public UnaryDatasetOpKernel { return "BigtableLookupDatasetOp::Dataset"; } + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + return errors::Unimplemented("%s does not support serialization", + DebugString()); + } + private: static ::google::cloud::bigtable::Filter MakeFilter( const std::vector& column_families, diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc index e960719614a1c7c6c4af53ea924aef214a09b24d..a803fdcb49604ef4e596b64d62c7278c69764c15 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_prefix_key_dataset_op.cc @@ -35,11 +35,13 @@ class BigtablePrefixKeyDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table, string prefix) - : GraphDatasetBase(ctx), table_(table), prefix_(std::move(prefix)) { + : DatasetBase(DatasetContext(ctx)), + table_(table), + prefix_(std::move(prefix)) { table_->Ref(); } @@ -47,8 +49,8 @@ class BigtablePrefixKeyDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { - return std::unique_ptr(new Iterator( - {this, strings::StrCat(prefix, "::BigtablePrefixKeyDataset")})); + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::BigtablePrefixKey")})); } const DataTypeVector& output_dtypes() const override { @@ -68,6 +70,14 @@ class BigtablePrefixKeyDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + return errors::Unimplemented("%s does not support serialization", + DebugString()); + } + private: class Iterator : public BigtableReaderDatasetIterator { public: diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc index 96d3565d9b90e72f9e25e69e91f1931c982714cd..5cd0371c79f7eded9303b81dd388df8d306dff80 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_range_key_dataset_op.cc @@ -39,11 +39,11 @@ class BigtableRangeKeyDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table, string start_key, string end_key) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), table_(table), start_key_(std::move(start_key)), end_key_(std::move(end_key)) { @@ -54,8 +54,8 @@ class BigtableRangeKeyDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { - return std::unique_ptr(new Iterator( - {this, strings::StrCat(prefix, "::BigtableRangeKeyDataset")})); + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::BigtableRangeKey")})); } const DataTypeVector& output_dtypes() const override { @@ -75,6 +75,14 @@ class BigtableRangeKeyDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + return errors::Unimplemented("%s does not support serialization", + DebugString()); + } + private: class Iterator : public BigtableReaderDatasetIterator { public: diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc index a1a63a975afd62325e01586542006058fa2c83bc..6928d9423c84f7504fea3ac1abd929357da034a5 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc @@ -52,11 +52,11 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table, string prefix, string start_key, string end_key) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), table_(table), key_range_(MakeMultiModeKeyRange( std::move(prefix), std::move(start_key), std::move(end_key))) { @@ -68,7 +68,7 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr(new Iterator( - {this, strings::StrCat(prefix, "::BigtableSampleKeyPairsDataset")})); + {this, strings::StrCat(prefix, "::BigtableSampleKeyPairs")})); } const DataTypeVector& output_dtypes() const override { @@ -87,6 +87,14 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel { return "BigtableSampleKeyPairsDatasetOp::Dataset"; } + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + return errors::Unimplemented("%s does not support serialization", + DebugString()); + } + private: static MultiModeKeyRange MakeMultiModeKeyRange(string prefix, string start_key, diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc index a5a47cfe2dcf7c4034e0d5bc7d9a73ef9c1dc94e..a759fb5063900199325304ccf83c52f3bdd7d702 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_keys_dataset_op.cc @@ -31,10 +31,10 @@ class BigtableSampleKeysDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table) - : GraphDatasetBase(ctx), table_(table) { + : DatasetBase(DatasetContext(ctx)), table_(table) { table_->Ref(); } @@ -43,7 +43,7 @@ class BigtableSampleKeysDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr(new Iterator( - {this, strings::StrCat(prefix, "::BigtableSampleKeysDataset")})); + {this, strings::StrCat(prefix, "::BigtableSampleKeys")})); } const DataTypeVector& output_dtypes() const override { @@ -63,6 +63,14 @@ class BigtableSampleKeysDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + return errors::Unimplemented("%s does not support serialization", + DebugString()); + } + private: class Iterator : public DatasetIterator { public: diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc index 13cb8681679ec1541b74a20474665f770790201f..78a920b077680980a209ad8c30c09409a6f4ebf5 100644 --- a/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc +++ b/tensorflow/contrib/bigtable/kernels/bigtable_scan_dataset_op.cc @@ -84,7 +84,7 @@ class BigtableScanDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, BigtableTableResource* table, string prefix, string start_key, string end_key, @@ -92,7 +92,7 @@ class BigtableScanDatasetOp : public DatasetOpKernel { std::vector columns, float probability, const DataTypeVector& output_types, std::vector output_shapes) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), table_(table), prefix_(std::move(prefix)), start_key_(std::move(start_key)), @@ -111,8 +111,8 @@ class BigtableScanDatasetOp : public DatasetOpKernel { std::unique_ptr MakeIteratorInternal( const string& prefix) const override { - return std::unique_ptr(new Iterator( - {this, strings::StrCat(prefix, "::BigtableScanDataset")})); + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::BigtableScan")})); } const DataTypeVector& output_dtypes() const override { @@ -129,6 +129,14 @@ class BigtableScanDatasetOp : public DatasetOpKernel { BigtableTableResource* table() const { return table_; } + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + return errors::Unimplemented("%s does not support serialization", + DebugString()); + } + private: class Iterator : public BigtableReaderDatasetIterator { public: diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py index fd30aa8bbb962257c1ef5ac07e047fffca88c4bc..3e1b6228673fbdcb5a228a11532d29e6b2c817dc 100644 --- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py +++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""The Python API for TensorFlow's Bigtable integration. +"""The Python API for TensorFlow's Cloud Bigtable integration. TensorFlow has support for reading from and writing to Cloud Bigtable. To use -the Bigtable TensorFlow integration, first create a BigtableClient (which -configures your connection to Cloud Bigtable), and then open a Table. The Table -object then allows you to create numerous @{tf.data.Dataset}s to read data, or -write a @{tf.data.Dataset} object to the underlying Bigtable Table. +TensorFlow + Cloud Bigtable integration, first create a BigtableClient to +configure your connection to Cloud Bigtable, and then create a BigtableTable +object to allow you to create numerous `tf.data.Dataset`s to read data, or +write a `tf.data.Dataset` object to the underlying Cloud Bigtable table. -For background on Google Cloud Bigtable, see: https://cloud.google.com/bigtable. +For background on Cloud Bigtable, see: https://cloud.google.com/bigtable . """ from __future__ import absolute_import @@ -48,7 +48,7 @@ class BigtableClient(object): """BigtableClient is the entrypoint for interacting with Cloud Bigtable in TF. BigtableClient encapsulates a connection to Cloud Bigtable, and exposes the - `table` method to open a Bigtable Table. + `table` method to open a Bigtable table. """ def __init__(self, @@ -94,7 +94,7 @@ class BigtableClient(object): project_id, instance_id, connection_pool_size, max_receive_message_size) def table(self, name, snapshot=None): - """Opens a table and returns a `BigtableTable` object. + """Opens a table and returns a `tf.contrib.bigtable.BigtableTable` object. Args: name: A `tf.string` `tf.Tensor` name of the table to open. @@ -102,8 +102,8 @@ class BigtableClient(object): request the creation of a snapshot. (Note: currently unimplemented.) Returns: - A `BigtableTable` python object representing the operations available on - the table. + A `tf.contrib.bigtable.BigtableTable` Python object representing the + operations available on the table. """ # TODO(saeta): Implement snapshot functionality. table = gen_bigtable_ops.bigtable_table(self._resource, name) @@ -133,7 +133,8 @@ class BigtableTable(object): """Retrieves the values of columns for a dataset of keys. Example usage: - ``` + + ```python table = bigtable_client.table("my_table") key_dataset = table.get_keys_prefix("imagenet") images = key_dataset.apply(table.lookup_columns(("cf1", "image"), @@ -144,7 +145,8 @@ class BigtableTable(object): Alternatively, you can use keyword arguments to specify the columns to capture. Example (same as above, rewritten): - ``` + + ```python table = bigtable_client.table("my_table") key_dataset = table.get_keys_prefix("imagenet") images = key_dataset.apply(table.lookup_columns( @@ -152,15 +154,17 @@ class BigtableTable(object): training_data = images.map(parse_and_crop, num_parallel_calls=64).batch(128) ``` - Note: certain kwargs keys are reserved, and thus some column families cannot - be identified using the kwargs syntax. Instead, please use the args syntax. - This list includes: + Note: certain `kwargs` keys are reserved, and thus, some column families + cannot be identified using the `kwargs` syntax. Instead, please use the + `args` syntax. This list includes: + - 'name' - This list can change at any time. + + Note: this list can change at any time. Args: *args: A list of tuples containing (column family, column name) pairs. - **kwargs: Column families and + **kwargs: Column families (keys) and column qualifiers (values). Returns: A function that can be passed to `tf.data.Dataset.apply` to retrieve the @@ -199,7 +203,7 @@ class BigtableTable(object): be retrieved. If end is None, all subsequent row keys will be retrieved. Returns: - A @{tf.data.Dataset} containing `tf.string` Tensors corresponding to all + A `tf.data.Dataset` containing `tf.string` Tensors corresponding to all of the row keys between `start` and `end`. """ # TODO(saeta): Make inclusive / exclusive configurable? @@ -215,7 +219,7 @@ class BigtableTable(object): retrieved. Returns: - A @{tf.data.Dataset}. containing `tf.string` Tensors corresponding to all + A `tf.data.Dataset`. containing `tf.string` Tensors corresponding to all of the row keys matching that prefix. """ return _BigtablePrefixKeyDataset(self, prefix) @@ -224,11 +228,11 @@ class BigtableTable(object): """Retrieves a sampling of row keys from the Bigtable table. This dataset is most often used in conjunction with - @{tf.contrib.data.parallel_interleave} to construct a set of ranges for + `tf.contrib.data.parallel_interleave` to construct a set of ranges for scanning in parallel. Returns: - A @{tf.data.Dataset} returning string row keys. + A `tf.data.Dataset` returning string row keys. """ return _BigtableSampleKeysDataset(self) @@ -268,7 +272,7 @@ class BigtableTable(object): that are treated as the column qualifier (column name). Returns: - A @{tf.data.Dataset} returning the row keys and the cell contents. + A `tf.data.Dataset` returning the row keys and the cell contents. Raises: ValueError: If the configured probability is unexpected. @@ -313,7 +317,7 @@ class BigtableTable(object): that are treated as the column qualifier (column name). Returns: - A @{tf.data.Dataset} returning the row keys and the cell contents. + A `tf.data.Dataset` returning the row keys and the cell contents. Raises: ValueError: If the configured probability is unexpected. @@ -331,7 +335,7 @@ class BigtableTable(object): """Retrieves row (including values) from the Bigtable service at high speed. Rows with row-key prefixed by `prefix` will be retrieved. This method is - similar to `scan_prefix`, but by constrast performs multiple sub-scans in + similar to `scan_prefix`, but by contrast performs multiple sub-scans in parallel in order to achieve higher performance. Note: The dataset produced by this method is not deterministic! @@ -369,7 +373,7 @@ class BigtableTable(object): that are treated as the column qualifier (column name). Returns: - A @{tf.data.Dataset} returning the row keys and the cell contents. + A `tf.data.Dataset` returning the row keys and the cell contents. Raises: ValueError: If the configured probability is unexpected. @@ -390,7 +394,7 @@ class BigtableTable(object): """Retrieves rows (including values) from the Bigtable service. Rows with row-keys between `start` and `end` will be retrieved. This method - is similar to `scan_range`, but by constrast performs multiple sub-scans in + is similar to `scan_range`, but by contrast performs multiple sub-scans in parallel in order to achieve higher performance. Note: The dataset produced by this method is not deterministic! @@ -431,7 +435,7 @@ class BigtableTable(object): that are treated as the column qualifier (column name). Returns: - A @{tf.data.Dataset} returning the row keys and the cell contents. + A `tf.data.Dataset` returning the row keys and the cell contents. Raises: ValueError: If the configured probability is unexpected. @@ -446,12 +450,12 @@ class BigtableTable(object): """Writes a dataset to the table. Args: - dataset: A @{tf.data.Dataset} to be written to this table. It must produce + dataset: A `tf.data.Dataset` to be written to this table. It must produce a list of number-of-columns+1 elements, all of which must be strings. The first value will be used as the row key, and subsequent values will be used as cell values for the corresponding columns from the corresponding column_families and columns entries. - column_families: A @{tf.Tensor} of `tf.string`s corresponding to the + column_families: A `tf.Tensor` of `tf.string`s corresponding to the column names to store the dataset's elements into. columns: A `tf.Tensor` of `tf.string`s corresponding to the column names to store the dataset's elements into. @@ -459,7 +463,7 @@ class BigtableTable(object): Leave as None to use server-provided timestamps. Returns: - A @{tf.Operation} that can be run to perform the write. + A `tf.Operation` that can be run to perform the write. Raises: ValueError: If there are unexpected or incompatible types, or if the @@ -498,7 +502,7 @@ class BigtableTable(object): normalized_columns: The column families and column qualifiers to retrieve. Returns: - A @{tf.data.Dataset} representing the result of the parallel scan. + A `tf.data.Dataset` representing the result of the parallel scan. """ if num_parallel_scans is None: num_parallel_scans = 50 @@ -712,7 +716,7 @@ class _BigtableScanDataset(dataset_ops.Dataset): class _BigtableSampleKeyPairsDataset(dataset_ops.Dataset): - """_BigtableKeyRangeDataset returns key pairs from the Bigtable. + """_BigtableSampleKeyPairsDataset returns key pairs from a Bigtable table. """ def __init__(self, table, prefix, start, end): diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD index f4a375328eb9cdbe17682637c2f20e3aa8a1e0ca..5fcb19a47aac492d49b0d8e99af5699bae2ad9f0 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD +++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD @@ -191,7 +191,7 @@ py_test( py_test( name = "estimator_test", - size = "medium", + size = "large", srcs = ["estimator_test.py"], srcs_version = "PY2AND3", tags = [ diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py index dbfa69edcbf9e59fedc068b8ee516b92e2c03f4f..194a5c8754cb0ab2db299e3fb5c998c0f27f8435 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py @@ -86,7 +86,8 @@ def _dnn_tree_combined_model_fn( tree_center_bias=False, dnn_to_tree_distillation_param=None, use_core_versions=False, - output_type=model.ModelBuilderOutputType.MODEL_FN_OPS): + output_type=model.ModelBuilderOutputType.MODEL_FN_OPS, + override_global_step_value=None): """DNN and GBDT combined model_fn. Args: @@ -135,6 +136,12 @@ def _dnn_tree_combined_model_fn( will be set to True. use_core_versions: Whether feature columns and loss are from the core (as opposed to contrib) version of tensorflow. + output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec + (new interface). + override_global_step_value: If after the training is done, global step + value must be reset to this value. This is particularly useful for hyper + parameter tuning, which can't recognize early stopping due to the number + of trees. If None, no override of global step will happen. Returns: A `ModelFnOps` object. @@ -350,7 +357,8 @@ def _dnn_tree_combined_model_fn( trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train, tree_train_op), trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, - finalized_trees) + finalized_trees, + override_global_step_value) ]) return model_fn_ops @@ -378,7 +386,8 @@ def _dnn_tree_combined_model_fn( trainer_hooks.SwitchTrainOp(dnn_spec.train_op, dnn_steps_to_train, tree_spec.train_op), trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, - finalized_trees) + finalized_trees, + override_global_step_value) ] fusion_spec = fusion_spec._replace(training_hooks=training_hooks + list(fusion_spec.training_hooks)) @@ -411,7 +420,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator): tree_feature_columns=None, tree_center_bias=False, dnn_to_tree_distillation_param=None, - use_core_versions=False): + use_core_versions=False, + override_global_step_value=None): """Initializes a DNNBoostedTreeCombinedClassifier instance. Args: @@ -467,6 +477,10 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator): will be set to True. use_core_versions: Whether feature columns and loss are from the core (as opposed to contrib) version of tensorflow. + override_global_step_value: If after the training is done, global step + value must be reset to this value. This is particularly useful for hyper + parameter tuning, which can't recognize early stopping due to the number + of trees. If None, no override of global step will happen. """ head = head_lib.multi_class_head( n_classes=n_classes, @@ -497,7 +511,8 @@ class DNNBoostedTreeCombinedClassifier(estimator.Estimator): tree_feature_columns=tree_feature_columns, tree_center_bias=tree_center_bias, dnn_to_tree_distillation_param=dnn_to_tree_distillation_param, - use_core_versions=use_core_versions) + use_core_versions=use_core_versions, + override_global_step_value=override_global_step_value) super(DNNBoostedTreeCombinedClassifier, self).__init__( model_fn=_model_fn, @@ -531,7 +546,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator): tree_feature_columns=None, tree_center_bias=False, dnn_to_tree_distillation_param=None, - use_core_versions=False): + use_core_versions=False, + override_global_step_value=None): """Initializes a DNNBoostedTreeCombinedRegressor instance. Args: @@ -587,6 +603,10 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator): will be set to True. use_core_versions: Whether feature columns and loss are from the core (as opposed to contrib) version of tensorflow. + override_global_step_value: If after the training is done, global step + value must be reset to this value. This is particularly useful for hyper + parameter tuning, which can't recognize early stopping due to the number + of trees. If None, no override of global step will happen. """ head = head_lib.regression_head( label_name=label_name, @@ -622,7 +642,8 @@ class DNNBoostedTreeCombinedRegressor(estimator.Estimator): tree_feature_columns=tree_feature_columns, tree_center_bias=tree_center_bias, dnn_to_tree_distillation_param=dnn_to_tree_distillation_param, - use_core_versions=use_core_versions) + use_core_versions=use_core_versions, + override_global_step_value=override_global_step_value) super(DNNBoostedTreeCombinedRegressor, self).__init__( model_fn=_model_fn, @@ -657,7 +678,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator): tree_feature_columns=None, tree_center_bias=False, dnn_to_tree_distillation_param=None, - use_core_versions=False): + use_core_versions=False, + override_global_step_value=None): """Initializes a DNNBoostedTreeCombinedEstimator instance. Args: @@ -708,6 +730,10 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator): will be set to True. use_core_versions: Whether feature columns and loss are from the core (as opposed to contrib) version of tensorflow. + override_global_step_value: If after the training is done, global step + value must be reset to this value. This is particularly useful for hyper + parameter tuning, which can't recognize early stopping due to the number + of trees. If None, no override of global step will happen. """ def _model_fn(features, labels, mode, config): @@ -732,7 +758,8 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator): tree_feature_columns=tree_feature_columns, tree_center_bias=tree_center_bias, dnn_to_tree_distillation_param=dnn_to_tree_distillation_param, - use_core_versions=use_core_versions) + use_core_versions=use_core_versions, + override_global_step_value=override_global_step_value) super(DNNBoostedTreeCombinedEstimator, self).__init__( model_fn=_model_fn, @@ -832,7 +859,8 @@ class CoreDNNBoostedTreeCombinedEstimator(core_estimator.Estimator): tree_center_bias=tree_center_bias, dnn_to_tree_distillation_param=dnn_to_tree_distillation_param, output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC, - use_core_versions=True) + use_core_versions=True, + override_global_step_value=None) super(CoreDNNBoostedTreeCombinedEstimator, self).__init__( model_fn=_model_fn, model_dir=model_dir, config=config) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py index 2df879f924d735c5bcd0d354159c825dee3afda8..870ce2442bb5e98db7615c43054c9c827b8c88f0 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py @@ -22,8 +22,10 @@ from tensorflow.contrib.boosted_trees.estimator_batch import model from tensorflow.contrib.boosted_trees.python.utils import losses from tensorflow.contrib.learn.python.learn.estimators import estimator from tensorflow.contrib.learn.python.learn.estimators import head as head_lib +from tensorflow.python.estimator.canned import head as core_head_lib from tensorflow.python.estimator import estimator as core_estimator from tensorflow.python.ops import math_ops +from tensorflow.python.ops.losses import losses as core_losses # ================== Old estimator interface=================================== @@ -49,7 +51,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator): logits_modifier_function=None, center_bias=True, use_core_libs=False, - output_leaf_index=False): + output_leaf_index=False, + override_global_step_value=None): """Initializes a GradientBoostedDecisionTreeClassifier estimator instance. Args: @@ -83,6 +86,14 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator): for result_dict in result_iter: # access leaf index list by result_dict["leaf_index"] # which contains one leaf index per tree + override_global_step_value: If after the training is done, global step + value must be reset to this value. This should be used to reset global + step to a number > number of steps used to train the current ensemble. + For example, the usual way is to train a number of trees and set a very + large number of training steps. When the training is done (number of + trees were trained), this parameter can be used to set the global step + to a large value, making it look like that number of training steps ran. + If None, no override of global step will happen. Raises: ValueError: If learner_config is not valid. @@ -123,6 +134,7 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator): 'logits_modifier_function': logits_modifier_function, 'use_core_libs': use_core_libs, 'output_leaf_index': output_leaf_index, + 'override_global_step_value': override_global_step_value }, model_dir=model_dir, config=config, @@ -146,7 +158,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator): logits_modifier_function=None, center_bias=True, use_core_libs=False, - output_leaf_index=False): + output_leaf_index=False, + override_global_step_value=None): """Initializes a GradientBoostedDecisionTreeRegressor estimator instance. Args: @@ -180,6 +193,14 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator): for example_prediction_result in result_dict: # access leaf index list by example_prediction_result["leaf_index"] # which contains one leaf index per tree + override_global_step_value: If after the training is done, global step + value must be reset to this value. This should be used to reset global + step to a number > number of steps used to train the current ensemble. + For example, the usual way is to train a number of trees and set a very + large number of training steps. When the training is done (number of + trees were trained), this parameter can be used to set the global step + to a large value, making it look like that number of training steps ran. + If None, no override of global step will happen. """ head = head_lib.regression_head( label_name=label_name, @@ -203,6 +224,7 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator): 'center_bias': center_bias, 'use_core_libs': use_core_libs, 'output_leaf_index': False, + 'override_global_step_value': override_global_step_value }, model_dir=model_dir, config=config, @@ -228,7 +250,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator): logits_modifier_function=None, center_bias=True, use_core_libs=False, - output_leaf_index=False): + output_leaf_index=False, + override_global_step_value=None): """Initializes a GradientBoostedDecisionTreeEstimator estimator instance. Args: @@ -258,6 +281,14 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator): for example_prediction_result in result_dict: # access leaf index list by example_prediction_result["leaf_index"] # which contains one leaf index per tree + override_global_step_value: If after the training is done, global step + value must be reset to this value. This should be used to reset global + step to a number > number of steps used to train the current ensemble. + For example, the usual way is to train a number of trees and set a very + large number of training steps. When the training is done (number of + trees were trained), this parameter can be used to set the global step + to a large value, making it look like that number of training steps ran. + If None, no override of global step will happen. """ super(GradientBoostedDecisionTreeEstimator, self).__init__( model_fn=model.model_builder, @@ -272,6 +303,7 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator): 'center_bias': center_bias, 'use_core_libs': use_core_libs, 'output_leaf_index': False, + 'override_global_step_value': override_global_step_value }, model_dir=model_dir, config=config, @@ -281,24 +313,23 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator): class GradientBoostedDecisionTreeRanker(estimator.Estimator): """A ranking estimator using gradient boosted decision trees.""" - def __init__( - self, - learner_config, - examples_per_layer, - head, - ranking_model_pair_keys, - num_trees=None, - feature_columns=None, - weight_column_name=None, - model_dir=None, - config=None, - label_keys=None, - feature_engineering_fn=None, - logits_modifier_function=None, - center_bias=False, - use_core_libs=False, - output_leaf_index=False, - ): + def __init__(self, + learner_config, + examples_per_layer, + head, + ranking_model_pair_keys, + num_trees=None, + feature_columns=None, + weight_column_name=None, + model_dir=None, + config=None, + label_keys=None, + feature_engineering_fn=None, + logits_modifier_function=None, + center_bias=False, + use_core_libs=False, + output_leaf_index=False, + override_global_step_value=None): """Initializes a GradientBoostedDecisionTreeRanker instance. This is an estimator that can be trained off the pairwise data and can be @@ -338,7 +369,14 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator): for result_dict in result_iter: # access leaf index list by result_dict["leaf_index"] # which contains one leaf index per tree - + override_global_step_value: If after the training is done, global step + value must be reset to this value. This should be used to reset global + step to a number > number of steps used to train the current ensemble. + For example, the usual way is to train a number of trees and set a very + large number of training steps. When the training is done (number of + trees were trained), this parameter can be used to set the global step + to a large value, making it look like that number of training steps ran. + If None, no override of global step will happen. Raises: ValueError: If learner_config is not valid. """ @@ -357,6 +395,7 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator): 'use_core_libs': use_core_libs, 'output_leaf_index': output_leaf_index, 'ranking_model_pair_keys': ranking_model_pair_keys, + 'override_global_step_value': override_global_step_value }, model_dir=model_dir, config=config, @@ -366,6 +405,25 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator): # The estimators below use new core Estimator interface and must be used with # new feature columns and heads. +# For multiclass classification, use the following head since it uses loss +# that is twice differentiable. +def core_multiclass_head(n_classes): + """Core head for multiclass problems.""" + + def loss_fn(labels, logits): + result = losses.per_example_maxent_loss( + labels=labels, logits=logits, weights=None, num_classes=n_classes) + return result[0] + + # pylint:disable=protected-access + head_fn = core_head_lib._multi_class_head_with_softmax_cross_entropy_loss( + n_classes=n_classes, + loss_fn=loss_fn, + loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS) + # pylint:enable=protected-access + + return head_fn + class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator): """An estimator using gradient boosted decision trees. @@ -435,6 +493,7 @@ class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator): 'logits_modifier_function': logits_modifier_function, 'use_core_libs': True, 'output_leaf_index': output_leaf_index, + 'override_global_step_value': None }, output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC) @@ -445,22 +504,20 @@ class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator): class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator): """A ranking estimator using gradient boosted decision trees.""" - def __init__( - self, - learner_config, - examples_per_layer, - head, - ranking_model_pair_keys, - num_trees=None, - feature_columns=None, - weight_column_name=None, - model_dir=None, - config=None, - label_keys=None, - logits_modifier_function=None, - center_bias=False, - output_leaf_index=False, - ): + def __init__(self, + learner_config, + examples_per_layer, + head, + ranking_model_pair_keys, + num_trees=None, + feature_columns=None, + weight_column_name=None, + model_dir=None, + config=None, + label_keys=None, + logits_modifier_function=None, + center_bias=False, + output_leaf_index=False): """Initializes a GradientBoostedDecisionTreeRanker instance. This is an estimator that can be trained off the pairwise data and can be @@ -519,6 +576,7 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator): 'use_core_libs': True, 'output_leaf_index': output_leaf_index, 'ranking_model_pair_keys': ranking_model_pair_keys, + 'override_global_step_value': None }, output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py index 9e9febbbef662a594d3589b501e9ae0eea0af196..68d710d713770a3a4a623b9447bb6a6b93569cac 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py @@ -25,10 +25,12 @@ from tensorflow.python.estimator.canned import head as head_lib from tensorflow.python.feature_column import feature_column_lib as core_feature_column from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops.losses import losses from tensorflow.python.platform import gfile from tensorflow.python.platform import googletest +from tensorflow.python.training import checkpoint_utils def _train_input_fn(): @@ -37,6 +39,15 @@ def _train_input_fn(): return features, label +def _multiclass_train_input_fn(): + features = { + "x": constant_op.constant([[2.], [1.], [1.], [5.], [3.5], [4.6], [3.5]]) + } + label = constant_op.constant( + [[1], [0], [0], [2], [2], [0], [1]], dtype=dtypes.int32) + return features, label + + def _ranking_train_input_fn(): features = { "a.f1": constant_op.constant([[3.], [0.3], [1.]]), @@ -68,6 +79,10 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): self._export_dir_base = tempfile.mkdtemp() + "export/" gfile.MkDir(self._export_dir_base) + def _assert_checkpoint(self, model_dir, global_step): + reader = checkpoint_utils.load_checkpoint(model_dir) + self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP)) + def testFitAndEvaluateDontThrowException(self): learner_config = learner_pb2.LearnerConfig() learner_config.num_classes = 2 @@ -202,6 +217,126 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): model.evaluate(input_fn=_ranking_train_input_fn, steps=1) model.predict(input_fn=_infer_ranking_train_input_fn) + def testDoesNotOverrideGlobalSteps(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 2 + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.GradientBoostedDecisionTreeClassifier( + learner_config=learner_config, + num_trees=1, + examples_per_layer=3, + model_dir=model_dir, + config=config, + feature_columns=[contrib_feature_column.real_valued_column("x")], + output_leaf_index=False) + + classifier.fit(input_fn=_train_input_fn, steps=15) + # When no override of global steps, 5 steps were used. + self._assert_checkpoint(classifier.model_dir, global_step=5) + + def testOverridesGlobalSteps(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 2 + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.GradientBoostedDecisionTreeClassifier( + learner_config=learner_config, + num_trees=1, + examples_per_layer=3, + model_dir=model_dir, + config=config, + feature_columns=[contrib_feature_column.real_valued_column("x")], + output_leaf_index=False, + override_global_step_value=10000000) + + classifier.fit(input_fn=_train_input_fn, steps=15) + self._assert_checkpoint(classifier.model_dir, global_step=10000000) + + def testFitAndEvaluateMultiClassTreePerClassDontThrowException(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 3 + learner_config.constraints.max_tree_depth = 1 + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.TREE_PER_CLASS) + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.GradientBoostedDecisionTreeClassifier( + learner_config=learner_config, + n_classes=learner_config.num_classes, + num_trees=1, + examples_per_layer=7, + model_dir=model_dir, + config=config, + feature_columns=[contrib_feature_column.real_valued_column("x")]) + + classifier.fit(input_fn=_multiclass_train_input_fn, steps=100) + classifier.evaluate(input_fn=_eval_input_fn, steps=1) + classifier.export(self._export_dir_base) + result_iter = classifier.predict(input_fn=_eval_input_fn) + for prediction_dict in result_iter: + self.assertTrue("classes" in prediction_dict) + + def testFitAndEvaluateMultiClassDiagonalDontThrowException(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 3 + learner_config.constraints.max_tree_depth = 1 + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.DIAGONAL_HESSIAN) + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.GradientBoostedDecisionTreeClassifier( + learner_config=learner_config, + n_classes=learner_config.num_classes, + num_trees=1, + examples_per_layer=7, + model_dir=model_dir, + config=config, + center_bias=False, + feature_columns=[contrib_feature_column.real_valued_column("x")]) + + classifier.fit(input_fn=_multiclass_train_input_fn, steps=100) + classifier.evaluate(input_fn=_eval_input_fn, steps=1) + classifier.export(self._export_dir_base) + result_iter = classifier.predict(input_fn=_eval_input_fn) + for prediction_dict in result_iter: + self.assertTrue("classes" in prediction_dict) + + def testFitAndEvaluateMultiClassFullDontThrowException(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 3 + learner_config.constraints.max_tree_depth = 1 + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.FULL_HESSIAN) + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.GradientBoostedDecisionTreeClassifier( + learner_config=learner_config, + n_classes=learner_config.num_classes, + num_trees=1, + examples_per_layer=7, + model_dir=model_dir, + config=config, + center_bias=False, + feature_columns=[contrib_feature_column.real_valued_column("x")]) + + classifier.fit(input_fn=_multiclass_train_input_fn, steps=100) + classifier.evaluate(input_fn=_eval_input_fn, steps=1) + classifier.export(self._export_dir_base) + result_iter = classifier.predict(input_fn=_eval_input_fn) + for prediction_dict in result_iter: + self.assertTrue("classes" in prediction_dict) + class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): @@ -257,6 +392,87 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): est.evaluate(input_fn=_ranking_train_input_fn, steps=1) est.predict(input_fn=_infer_ranking_train_input_fn) + def testFitAndEvaluateMultiClassTreePerClasssDontThrowException(self): + n_classes = 3 + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = n_classes + learner_config.constraints.max_tree_depth = 1 + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.TREE_PER_CLASS) + + head_fn = estimator.core_multiclass_head(n_classes=n_classes) + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.CoreGradientBoostedDecisionTreeEstimator( + learner_config=learner_config, + head=head_fn, + num_trees=1, + center_bias=False, + examples_per_layer=7, + model_dir=model_dir, + config=config, + feature_columns=[core_feature_column.numeric_column("x")]) + + classifier.train(input_fn=_multiclass_train_input_fn, steps=100) + classifier.evaluate(input_fn=_multiclass_train_input_fn, steps=1) + classifier.predict(input_fn=_eval_input_fn) + + def testFitAndEvaluateMultiClassDiagonalDontThrowException(self): + n_classes = 3 + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = n_classes + learner_config.constraints.max_tree_depth = 1 + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.DIAGONAL_HESSIAN) + + head_fn = estimator.core_multiclass_head(n_classes=n_classes) + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.CoreGradientBoostedDecisionTreeEstimator( + learner_config=learner_config, + head=head_fn, + num_trees=1, + center_bias=False, + examples_per_layer=7, + model_dir=model_dir, + config=config, + feature_columns=[core_feature_column.numeric_column("x")]) + + classifier.train(input_fn=_multiclass_train_input_fn, steps=100) + classifier.evaluate(input_fn=_multiclass_train_input_fn, steps=1) + classifier.predict(input_fn=_eval_input_fn) + + def testFitAndEvaluateMultiClassFullDontThrowException(self): + n_classes = 3 + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = n_classes + learner_config.constraints.max_tree_depth = 1 + learner_config.multi_class_strategy = ( + learner_pb2.LearnerConfig.FULL_HESSIAN) + + head_fn = estimator.core_multiclass_head(n_classes=n_classes) + + model_dir = tempfile.mkdtemp() + config = run_config.RunConfig() + + classifier = estimator.CoreGradientBoostedDecisionTreeEstimator( + learner_config=learner_config, + head=head_fn, + num_trees=1, + center_bias=False, + examples_per_layer=7, + model_dir=model_dir, + config=config, + feature_columns=[core_feature_column.numeric_column("x")]) + + classifier.train(input_fn=_multiclass_train_input_fn, steps=100) + classifier.evaluate(input_fn=_multiclass_train_input_fn, steps=1) + classifier.predict(input_fn=_eval_input_fn) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py index 161cc42cb0fe93c18722923095edf7228b5b378c..04b46c3483fa25286078b88c2776b76e4f3c0bcf 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py @@ -58,6 +58,10 @@ def model_builder(features, * weight_column_name: The name of weight column. * center_bias: Whether a separate tree should be created for first fitting the bias. + * override_global_step_value: If after the training is done, global step + value must be reset to this value. This is particularly useful for hyper + parameter tuning, which can't recognize early stopping due to the number + of trees. If None, no override of global step will happen. config: `RunConfig` of the estimator. output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec (new interface). @@ -76,6 +80,7 @@ def model_builder(features, use_core_libs = params["use_core_libs"] logits_modifier_function = params["logits_modifier_function"] output_leaf_index = params["output_leaf_index"] + override_global_step_value = params.get("override_global_step_value", None) if features is None: raise ValueError("At least one feature must be specified.") @@ -136,7 +141,8 @@ def model_builder(features, finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor() training_hooks.append( trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, - finalized_trees)) + finalized_trees, + override_global_step_value)) if output_type == ModelBuilderOutputType.MODEL_FN_OPS: if use_core_libs and callable(create_estimator_spec_op): @@ -206,6 +212,10 @@ def ranking_model_builder(features, for left and right part of the training pairs for ranking. For example, for an Example with features "a.f1" and "b.f1", the keys would be ("a", "b"). + * override_global_step_value: If after the training is done, global step + value must be reset to this value. This is particularly useful for hyper + parameter tuning, which can't recognize early stopping due to the number + of trees. If None, no override of global step will happen. config: `RunConfig` of the estimator. output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec (new interface). @@ -226,6 +236,7 @@ def ranking_model_builder(features, logits_modifier_function = params["logits_modifier_function"] output_leaf_index = params["output_leaf_index"] ranking_model_pair_keys = params["ranking_model_pair_keys"] + override_global_step_value = params.get("override_global_step_value", None) if features is None: raise ValueError("At least one feature must be specified.") @@ -347,7 +358,8 @@ def ranking_model_builder(features, gbdt_model_main.get_number_of_trees_tensor()) training_hooks.append( trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, - finalized_trees)) + finalized_trees, + override_global_step_value)) if output_type == ModelBuilderOutputType.MODEL_FN_OPS: if use_core_libs and callable(create_estimator_spec_op): diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py index 2e4151cac40f770e2bece70d752122eb7f34dd40..f137ada35524bf2467314f4a284ea35a82f06825 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/trainer_hooks.py @@ -25,6 +25,7 @@ from tensorflow.contrib.learn.python.learn.session_run_hook import SessionRunArg from tensorflow.core.framework.summary_pb2 import Summary from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import state_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import training_util from tensorflow.python.training.summary_io import SummaryWriterCache @@ -150,12 +151,23 @@ class FeedFnHook(session_run_hook.SessionRunHook): class StopAfterNTrees(session_run_hook.SessionRunHook): """Stop training after building N full trees.""" - def __init__(self, n, num_attempted_trees_tensor, num_finalized_trees_tensor): + def __init__(self, n, num_attempted_trees_tensor, num_finalized_trees_tensor, + override_global_step_value=None): self._num_trees = n # num_attempted_trees_tensor and num_finalized_trees_tensor are both # tensors. self._num_attempted_trees_tensor = num_attempted_trees_tensor self._num_finalized_trees_tensor = num_finalized_trees_tensor + self._override_global_step_value = override_global_step_value + + def begin(self): + self._global_step_tensor = training_util.get_global_step() + if self._global_step_tensor is None: + raise RuntimeError("Global step should be created.") + + if self._override_global_step_value is not None: + self._override_global_step_op = state_ops.assign( + self._global_step_tensor, self._override_global_step_value) def before_run(self, run_context): del run_context # unused by StopTrainingAfterNTrees. @@ -175,6 +187,9 @@ class StopAfterNTrees(session_run_hook.SessionRunHook): num_attempted_trees > 2 * self._num_trees): logging.info("Requesting stop since we have reached %d trees.", num_finalized_trees) + if self._override_global_step_value is not None: + logging.info("Overriding global steps value.") + run_context.session.run(self._override_global_step_op) run_context.request_stop() diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc index 5b4be2f25838d5405a8148ea20cb0f759cd3a8fb..1375fddf2bea1a8f856c35d756c38a8beb14a53f 100644 --- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc @@ -125,6 +125,8 @@ void QuantizeFeatures( auto flat_values = values_tensor.flat(); for (int64 instance = 0; instance < num_values; ++instance) { const float value = flat_values(instance); + CHECK(!buckets_vector.empty()) + << "Got empty buckets for feature " << feature_index; auto bucket_iter = std::lower_bound(buckets_vector.begin(), buckets_vector.end(), value); if (bucket_iter == buckets_vector.end()) { diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py index 1b7f59ea4218355a13f1df7264352bd68503bd19..5d4819b0f1cb598cfbe146f569aecd7883186339 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py @@ -131,6 +131,10 @@ class BaseSplitHandler(object): }, stamp_token, None) return control_flow_ops.group(update_1, *update_2[self]) + @abc.abstractmethod + def reset(self, stamp_token, next_stamp_token): + """Resets the state maintained by the handler.""" + @abc.abstractmethod def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state. diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py index bf686237ff696dadad9713d26bf784d7442b80d0..efe29216c2a7d8aa985da54cdbb839b9e6f69078 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py @@ -202,3 +202,7 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler): # always return ready. are_splits_ready = constant_op.constant(True) return (are_splits_ready, partition_ids, gains, split_infos) + + def reset(self, stamp_token, next_stamp_token): + reset = self._stats_accumulator.flush(stamp_token, next_stamp_token) + return reset diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index df0bec1fe363e07bbff6b059e86076239bd605e9..2559fe9913f377ce38aa11dfa908cd25ec76dab4 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -79,6 +79,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops + _BIAS_FEATURE_ID = -1 # Pattern to remove all non alpha numeric from a string. _PATTERN = re.compile(r"[\W_]+") @@ -147,6 +148,11 @@ class InequalitySplitHandler(base_split_handler.BaseSplitHandler): num_quantiles=num_quantiles, name="QuantileAccumulator/{}".format(self._name)) + def reset(self, stamp_token, next_stamp_token): + reset_1 = self._stats_accumulator.flush(stamp_token, next_stamp_token) + reset_2 = self._quantile_accumulator.flush(stamp_token, next_stamp_token) + return control_flow_ops.group([reset_1, reset_2]) + class DenseSplitHandler(InequalitySplitHandler): """Computes stats and finds the best inequality splits on dense columns.""" @@ -264,6 +270,7 @@ class DenseSplitHandler(InequalitySplitHandler): self._feature_column_group_id, self._l1_regularization, self._l2_regularization, self._tree_complexity_regularization, self._min_node_weight, self._loss_uses_sum_reduction)) + return are_splits_ready, partition_ids, gains, split_infos @@ -579,8 +586,10 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column, example_partition_ids, feature_ids, gradients, hessians = ( control_flow_ops.cond( - math_ops.logical_and(are_buckets_ready, is_active[0]), - ready_inputs_fn, not_ready_inputs_fn)) + math_ops.logical_and( + math_ops.logical_and(are_buckets_ready, + array_ops.size(quantile_buckets) > 0), + is_active[0]), ready_inputs_fn, not_ready_inputs_fn)) return (quantile_values, quantile_weights, example_partition_ids, feature_ids, gradients, hessians) @@ -674,8 +683,10 @@ def sparse_make_stats_update( lambda: handler_not_active)) example_partition_ids, feature_ids, gradients, hessians = ( - control_flow_ops.cond(are_buckets_ready, quantiles_ready, - quantiles_not_ready)) + control_flow_ops.cond( + math_ops.logical_and(are_buckets_ready, + array_ops.size(quantile_buckets) > 0), + quantiles_ready, quantiles_not_ready)) return (quantile_indices, quantile_values, quantile_shape, quantile_weights, example_partition_ids, feature_ids, gradients, hessians) diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py index d59732cf92eb85e88732ac5a17dccf475ae5342f..5d82c4cae5dbe28c82fa8754a7c65db62a2e6814 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py @@ -1072,8 +1072,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): with self.test_session() as sess: # Batch is 4, 2 classes - gradients = array_ops.constant( - [[0.2, 1.4], [-0.5, 0.1], [1.2, 3], [4.0, -3]]) + gradients = array_ops.constant([[0.2, 1.4], [-0.5, 0.1], [1.2, 3], + [4.0, -3]]) # 2x2 matrix for each instance hessian_0 = [[0.12, 0.02], [0.3, 0.11]] hessian_1 = [[0.07, -0.2], [-0.5, 0.2]] @@ -1167,8 +1167,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): with self.test_session() as sess: # Batch is 4, 2 classes - gradients = array_ops.constant( - [[0.2, 1.4], [-0.5, 0.1], [1.2, 3], [4.0, -3]]) + gradients = array_ops.constant([[0.2, 1.4], [-0.5, 0.1], [1.2, 3], + [4.0, -3]]) # Each hessian is a diagonal from a full hessian matrix. hessian_0 = [0.12, 0.11] hessian_1 = [0.07, 0.2] @@ -1406,6 +1406,100 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): self.assertEqual(len(gains), 0) self.assertEqual(len(splits), 0) + def testEmptyBuckets(self): + """Test that reproduces the case when quantile buckets were empty.""" + with self.test_session() as sess: + sparse_column = array_ops.sparse_placeholder(dtypes.float32) + + # We have two batches - at first, a sparse feature is empty. + empty_indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2]) + empty_values = array_ops.constant([], dtype=dtypes.float32) + empty_sparse_column = sparse_tensor.SparseTensor(empty_indices, + empty_values, [4, 2]) + empty_sparse_column = empty_sparse_column.eval(session=sess) + + # For the second batch, the sparse feature is not empty. + non_empty_indices = array_ops.constant( + [[0, 0], [2, 1], [3, 2]], dtype=dtypes.int64, shape=[3, 2]) + non_empty_values = array_ops.constant( + [0.52, 0.3, 0.52], dtype=dtypes.float32) + non_empty_sparse_column = sparse_tensor.SparseTensor( + non_empty_indices, non_empty_values, [4, 2]) + non_empty_sparse_column = non_empty_sparse_column.eval(session=sess) + + gradient_shape = tensor_shape.scalar() + hessian_shape = tensor_shape.scalar() + class_id = -1 + + split_handler = ordinal_split_handler.SparseSplitHandler( + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, + epsilon=0.01, + num_quantiles=2, + feature_column_group_id=0, + sparse_float_column=sparse_column, + init_stamp_token=0, + gradient_shape=gradient_shape, + hessian_shape=hessian_shape, + multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS) + resources.initialize_resources(resources.shared_resources()).run() + gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0]) + hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) + partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) + + empty_gradients, empty_hessians = get_empty_tensors( + gradient_shape, hessian_shape) + example_weights = array_ops.ones([4, 1], dtypes.float32) + + update_1 = split_handler.update_stats_sync( + 0, + partition_ids, + gradients, + hessians, + empty_gradients, + empty_hessians, + example_weights, + is_active=array_ops.constant([True, True])) + with ops.control_dependencies([update_1]): + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] + + # First, calculate quantiles and try to update on an empty data for a + # feature. + are_splits_ready = ( + sess.run( + are_splits_ready, + feed_dict={sparse_column: empty_sparse_column})) + self.assertFalse(are_splits_ready) + + update_2 = split_handler.update_stats_sync( + 1, + partition_ids, + gradients, + hessians, + empty_gradients, + empty_hessians, + example_weights, + is_active=array_ops.constant([True, True])) + with ops.control_dependencies([update_2]): + are_splits_ready2, partitions, gains, splits = ( + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) + + # Now the feature in the second batch is not empty, but buckets + # calculated on the first batch are empty. + are_splits_ready2, partitions, gains, splits = ( + sess.run( + [are_splits_ready2, partitions, gains, splits], + feed_dict={sparse_column: non_empty_sparse_column})) + self.assertFalse(are_splits_ready) + self.assertTrue(are_splits_ready2) + # Since the buckets were empty, we can't calculate the splits. + self.assertEqual(len(partitions), 0) + self.assertEqual(len(gains), 0) + self.assertEqual(len(splits), 0) + def testDegenerativeCase(self): with self.test_session() as sess: # One data example only, one leaf and thus one quantile bucket.The same diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 19e053fcb629c73c00cbfcf6f9afee75b10e5f15..d0d1249bd6afc9cdbf6d88298c5024a4a54a5073 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -51,6 +51,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import summary from tensorflow.python.training import device_setter + # Key names for prediction dict. ENSEMBLE_STAMP = "ensemble_stamp" PREDICTIONS = "predictions" @@ -353,6 +354,9 @@ class GradientBoostedDecisionTreeModel(object): self._gradient_shape = tensor_shape.scalar() self._hessian_shape = tensor_shape.scalar() else: + if center_bias: + raise ValueError("Center bias should be False for multiclass.") + self._gradient_shape = tensor_shape.TensorShape([logits_dimension]) if (learner_config.multi_class_strategy == learner_pb2.LearnerConfig.FULL_HESSIAN): @@ -895,7 +899,7 @@ class GradientBoostedDecisionTreeModel(object): reset_ops = [] for handler in handlers: - reset_ops.append(handler.make_splits(stamp_token, next_stamp_token, 0)) + reset_ops.append(handler.reset(stamp_token, next_stamp_token)) if self._center_bias: reset_ops.append( bias_stats_accumulator.flush(stamp_token, next_stamp_token)) diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py index 2fbaa31d5e19b58c335cd0a894e1db9af2c34d08..e92f0bb841ac6dc57547874881af8bd10c47474f 100644 --- a/tensorflow/contrib/checkpoint/__init__.py +++ b/tensorflow/contrib/checkpoint/__init__.py @@ -31,6 +31,9 @@ Checkpointable data structures: @@List @@Mapping @@UniqueNameTracker + +Checkpoint management: +@@CheckpointManager """ from __future__ import absolute_import @@ -41,6 +44,7 @@ from tensorflow.contrib.checkpoint.python.containers import UniqueNameTracker from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph +from tensorflow.python.training.checkpoint_management import CheckpointManager from tensorflow.python.training.checkpointable.base import CheckpointableBase from tensorflow.python.training.checkpointable.data_structures import List from tensorflow.python.training.checkpointable.data_structures import Mapping diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc index 1bfd27305d569668a0bd67d876e59eec082296b3..58fadffce32f9a8fec047d1e99f9f4eb5a710d91 100644 --- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc +++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.cc @@ -85,7 +85,7 @@ Status BigQueryTableAccessor::New( int64 timestamp_millis, int64 row_buffer_size, const string& end_point, const std::vector& columns, const BigQueryTablePartition& partition, std::unique_ptr auth_provider, - std::unique_ptr http_request_factory, + std::shared_ptr http_request_factory, std::unique_ptr* accessor) { if (timestamp_millis <= 0) { return errors::InvalidArgument( @@ -94,29 +94,19 @@ Status BigQueryTableAccessor::New( const string& big_query_end_point = end_point.empty() ? kBigQueryEndPoint : end_point; if (auth_provider == nullptr && http_request_factory == nullptr) { - accessor->reset(new BigQueryTableAccessor( - project_id, dataset_id, table_id, timestamp_millis, row_buffer_size, - big_query_end_point, columns, partition)); - } else { - accessor->reset(new BigQueryTableAccessor( - project_id, dataset_id, table_id, timestamp_millis, row_buffer_size, - big_query_end_point, columns, partition, std::move(auth_provider), - std::move(http_request_factory))); + http_request_factory = std::make_shared(); + auto compute_engine_metadata_client = + std::make_shared(http_request_factory); + auth_provider = std::unique_ptr( + new GoogleAuthProvider(compute_engine_metadata_client)); } - return (*accessor)->ReadSchema(); -} -BigQueryTableAccessor::BigQueryTableAccessor( - const string& project_id, const string& dataset_id, const string& table_id, - int64 timestamp_millis, int64 row_buffer_size, const string& end_point, - const std::vector& columns, const BigQueryTablePartition& partition) - : BigQueryTableAccessor( - project_id, dataset_id, table_id, timestamp_millis, row_buffer_size, - end_point, columns, partition, - std::unique_ptr(new GoogleAuthProvider()), - std::unique_ptr( - new CurlHttpRequest::Factory())) { - row_buffer_.resize(row_buffer_size); + accessor->reset(new BigQueryTableAccessor( + project_id, dataset_id, table_id, timestamp_millis, row_buffer_size, + big_query_end_point, columns, partition, std::move(auth_provider), + std::move(http_request_factory))); + + return (*accessor)->ReadSchema(); } BigQueryTableAccessor::BigQueryTableAccessor( @@ -124,7 +114,7 @@ BigQueryTableAccessor::BigQueryTableAccessor( int64 timestamp_millis, int64 row_buffer_size, const string& end_point, const std::vector& columns, const BigQueryTablePartition& partition, std::unique_ptr auth_provider, - std::unique_ptr http_request_factory) + std::shared_ptr http_request_factory) : project_id_(project_id), dataset_id_(dataset_id), table_id_(table_id), diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h index b349063715c903c982cfe2fb116b6525e35ff63b..1af43a3e1070d466bb50019f12b22a060c1e6ab1 100644 --- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h +++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor.h @@ -109,24 +109,17 @@ class BigQueryTableAccessor { const std::vector& columns, const BigQueryTablePartition& partition, std::unique_ptr auth_provider, - std::unique_ptr http_request_factory, + std::shared_ptr http_request_factory, std::unique_ptr* accessor); /// \brief Constructs an object for a given table and partition. - BigQueryTableAccessor(const string& project_id, const string& dataset_id, - const string& table_id, int64 timestamp_millis, - int64 row_buffer_size, const string& end_point, - const std::vector& columns, - const BigQueryTablePartition& partition); - - /// Used for unit testing. BigQueryTableAccessor( const string& project_id, const string& dataset_id, const string& table_id, int64 timestamp_millis, int64 row_buffer_size, const string& end_point, const std::vector& columns, const BigQueryTablePartition& partition, std::unique_ptr auth_provider, - std::unique_ptr http_request_factory); + std::shared_ptr http_request_factory); /// \brief Parses column values for a given row. Status ParseColumnValues(const Json::Value& value, @@ -199,7 +192,7 @@ class BigQueryTableAccessor { SchemaNode schema_root_; std::unique_ptr auth_provider_; - std::unique_ptr http_request_factory_; + std::shared_ptr http_request_factory_; TF_DISALLOW_COPY_AND_ASSIGN(BigQueryTableAccessor); }; diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index f9dc3effd075d7e0add07aa77039824031976772..1ab150d74ac00c5f9acf3c9399880708b2f62b1e 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -148,6 +148,9 @@ class TPUClusterResolver(ClusterResolver): else: tpu = self._envVarFallback() + if tpu is None: + raise ValueError('Please provide a TPU Name to connect to.') + self._tpu = compat.as_bytes(tpu) # self._tpu is always bytes self._job_name = job_name self._credentials = credentials diff --git a/tensorflow/contrib/cmake/external/eigen.cmake b/tensorflow/contrib/cmake/external/eigen.cmake index 45a0096085cc2a6332c82e1ea284812acdd45152..33bb31148d2e5b7ca177d7c30b7781e8f620c3cb 100644 --- a/tensorflow/contrib/cmake/external/eigen.cmake +++ b/tensorflow/contrib/cmake/external/eigen.cmake @@ -19,6 +19,12 @@ # build_file = "eigen.BUILD", #) +option(eigen_PATCH_FILE "Patch file to apply to eigen" OFF) +set(eigen_PATCH_COMMAND "") +if(eigen_PATCH_FILE) + set(eigen_PATCH_COMMAND PATCH_COMMAND patch -p0 -i "${eigen_PATCH_FILE}") +endif(eigen_PATCH_FILE) + include (ExternalProject) # We parse the current Eigen version and archive hash from the bazel configuration @@ -45,6 +51,7 @@ ExternalProject_Add(eigen URL ${eigen_URL} DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" INSTALL_DIR "${eigen_INSTALL}" + ${eigen_PATCH_COMMAND} CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF diff --git a/tensorflow/contrib/cmake/external/highwayhash.cmake b/tensorflow/contrib/cmake/external/highwayhash.cmake index a6e8a38d8c2ee3deb5453c264e0c5eb23248301f..7d260b85f21e7e56e153daf550c81155e4b68777 100644 --- a/tensorflow/contrib/cmake/external/highwayhash.cmake +++ b/tensorflow/contrib/cmake/external/highwayhash.cmake @@ -20,14 +20,6 @@ set(highwayhash_TAG be5edafc2e1a455768e260ccd68ae7317b6690ee) set(highwayhash_BUILD ${CMAKE_CURRENT_BINARY_DIR}/highwayhash/src/highwayhash) set(highwayhash_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/highwayhash/install) -# put highwayhash includes in the directory where they are expected -add_custom_target(highwayhash_create_destination_dir - COMMAND ${CMAKE_COMMAND} -E make_directory ${highwayhash_INCLUDE_DIR}/highwayhash - DEPENDS highwayhash) - -add_custom_target(highwayhash_copy_headers_to_destination - DEPENDS highwayhash_create_destination_dir) - if(WIN32) set(highwayhash_HEADERS "${highwayhash_BUILD}/highwayhash/*.h") set(highwayhash_STATIC_LIBRARIES ${highwayhash_INSTALL}/lib/highwayhash.lib) @@ -36,6 +28,20 @@ else() set(highwayhash_STATIC_LIBRARIES ${highwayhash_INSTALL}/lib/libhighwayhash.a) endif() +set(highwayhash_HEADERS + "${highwayhash_INSTALL}/include/code_annotation.h" + "${highwayhash_INSTALL}/include/highway_tree_hash.h" + "${highwayhash_INSTALL}/include/scalar_highway_tree_hash.h" + "${highwayhash_INSTALL}/include/scalar_sip_tree_hash.h" + "${highwayhash_INSTALL}/include/sip_hash.h" + "${highwayhash_INSTALL}/include/sip_tree_hash.h" + "${highwayhash_INSTALL}/include/sse41_highway_tree_hash.h" + "${highwayhash_INSTALL}/include/state_helpers.h" + "${highwayhash_INSTALL}/include/types.h" + "${highwayhash_INSTALL}/include/vec.h" + "${highwayhash_INSTALL}/include/vec2.h" +) + ExternalProject_Add(highwayhash PREFIX highwayhash GIT_REPOSITORY ${highwayhash_URL} @@ -50,5 +56,15 @@ ExternalProject_Add(highwayhash -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_INSTALL_PREFIX:STRING=${highwayhash_INSTALL}) -add_custom_command(TARGET highwayhash_copy_headers_to_destination PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory ${highwayhash_INSTALL}/include/ ${highwayhash_INCLUDE_DIR}/highwayhash) +# put highwayhash includes in the directory where they are expected +add_custom_target(highwayhash_create_destination_dir + COMMAND ${CMAKE_COMMAND} -E make_directory ${highwayhash_INCLUDE_DIR}/highwayhash + DEPENDS highwayhash) + +add_custom_target(highwayhash_copy_headers_to_destination + DEPENDS highwayhash_create_destination_dir) + +foreach(header_file ${highwayhash_HEADERS}) + add_custom_command(TARGET highwayhash_copy_headers_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${highwayhash_INCLUDE_DIR}/highwayhash/) +endforeach() diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake index eba3bcfc79efe87d0a45c979c5accfa1b6511ed0..1d638e64023c7e2706d8d97ff8679677b6cd289d 100644 --- a/tensorflow/contrib/cmake/external/nsync.cmake +++ b/tensorflow/contrib/cmake/external/nsync.cmake @@ -20,14 +20,6 @@ set(nsync_TAG 1.20.0) set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync) set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install) -# put nsync includes in the directory where they are expected -add_custom_target(nsync_create_destination_dir - COMMAND ${CMAKE_COMMAND} -E make_directory ${nsync_INCLUDE_DIR} - DEPENDS nsync) - -add_custom_target(nsync_copy_headers_to_destination - DEPENDS nsync_create_destination_dir) - if(WIN32) set(nsync_HEADERS "${nsync_BUILD}/public/*.h") set(nsync_STATIC_LIBRARIES ${nsync_INSTALL}/lib/nsync.lib) @@ -49,7 +41,35 @@ ExternalProject_Add(nsync -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_INSTALL_PREFIX:STRING=${nsync_INSTALL} - -DNSYNC_LANGUAGE:STRING=c++11) + -DNSYNC_LANGUAGE:STRING=c++11) + +set(nsync_HEADERS + "${nsync_INSTALL}/include/nsync.h" + "${nsync_INSTALL}/include/nsync_atomic.h" + "${nsync_INSTALL}/include/nsync_counter.h" + "${nsync_INSTALL}/include/nsync_cpp.h" + "${nsync_INSTALL}/include/nsync_cv.h" + "${nsync_INSTALL}/include/nsync_debug.h" + "${nsync_INSTALL}/include/nsync_mu.h" + "${nsync_INSTALL}/include/nsync_mu_wait.h" + "${nsync_INSTALL}/include/nsync_note.h" + "${nsync_INSTALL}/include/nsync_once.h" + "${nsync_INSTALL}/include/nsync_time.h" + "${nsync_INSTALL}/include/nsync_time_internal.h" + "${nsync_INSTALL}/include/nsync_waiter.h" +) + +# put nsync includes in the directory where they are expected +add_custom_target(nsync_create_destination_dir + COMMAND ${CMAKE_COMMAND} -E make_directory ${nsync_INCLUDE_DIR} + DEPENDS nsync) + +add_custom_target(nsync_copy_headers_to_destination + DEPENDS nsync_create_destination_dir) + +foreach(header_file ${nsync_HEADERS}) + add_custom_command(TARGET nsync_copy_headers_to_destination PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${nsync_INCLUDE_DIR}/) +endforeach() + -add_custom_command(TARGET nsync_copy_headers_to_destination PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory ${nsync_INSTALL}/include/ ${nsync_INCLUDE_DIR}/) diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 75e00f32675df1b7e523bc7e8bb44fa584b79347..a5a947f7261559b6d25c452efe35097258d5625c 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -115,7 +115,6 @@ tensorflow/contrib/coder tensorflow/contrib/coder/kernels tensorflow/contrib/coder/ops tensorflow/contrib/coder/python -tensorflow/contrib/coder/python/layers tensorflow/contrib/coder/python/ops tensorflow/contrib/compiler tensorflow/contrib/constrained_optimization @@ -187,6 +186,8 @@ tensorflow/contrib/graph_editor/examples tensorflow/contrib/grid_rnn tensorflow/contrib/grid_rnn/python tensorflow/contrib/grid_rnn/python/ops +tensorflow/contrib/hadoop/python +tensorflow/contrib/hadoop/python/ops tensorflow/contrib/hooks tensorflow/contrib/hooks/python tensorflow/contrib/image diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index 32b185f07b6ba836ffb47e85beff6fb2481fdc3e..6d86daf5f174a3238ab92e5bba6085c904766766 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -198,7 +198,7 @@ function(add_python_module MODULE_NAME) # so we currently add explicit commands to include those files # later on in this script. if (NOT "${script}" MATCHES "_test\.py$") - add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD + add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/${script} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/${script}) endif() endforeach() @@ -297,7 +297,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name) ) target_link_libraries(${tf_python_op_lib_name}_gen_python PRIVATE tf_protos_cc - tf_python_protos_cc + tf_python_protos_cc ${tensorflow_EXTERNAL_LIBRARIES} ) @@ -549,15 +549,15 @@ if(WIN32) ${NUMPY_INCLUDE_DIR} ) #target_link_libraries(pywrap_tensorflow_internal_static - # tf_protos_cc - # tf_python_protos_cc + # tf_protos_cc + # tf_python_protos_cc #) add_dependencies(pywrap_tensorflow_internal_static tf_protos_cc tf_python_protos_cc) set(pywrap_tensorflow_internal_static_dependencies $ $ $ - ${nsync_STATIC_LIBRARIES} + ${nsync_STATIC_LIBRARIES} ) if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") @@ -737,7 +737,7 @@ endif() ######################################################## # Parse tensorflow/python/tools/api/generator/BUILD to get list of generated files. -FILE(READ ${tensorflow_source_dir}/tensorflow/python/tools/api/generator/api_gen.bzl api_generator_BUILD_text) +FILE(READ ${tensorflow_source_dir}/tensorflow/python/tools/api/generator/api_init_files.bzl api_generator_BUILD_text) STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text}) string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text}) string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text}) @@ -763,57 +763,40 @@ file(WRITE "${api_init_list_file}" "${api_init_files}") # recongnize paths. As CUDA isn't built with MKL, the MKL built directory is the only path to this command to work around that issue. # To not override the CUDA and system path in other circumstances, `if-else` branch used here to handle this problem, # and should be removed if the path issue can be resolved. +# UPDATE: Below block appears to handle multiple items in PATH correctly, but risks command line limits if PATH is large. +# If you have issues, try `set(PY_RUNTIME_ENV "PATH=${mkl_BIN_DIRS}")` instead. ### -if (tensorflow_ENABLE_MKL_SUPPORT) +set(PY_RUNTIME_ENV "") +if(tensorflow_ENABLE_MKL_SUPPORT) # add mkl dist dlls to system path for python - # TODO: In current cmake version, PY_RUNTIME_ENV behaves strange with multiple paths, - # so we have to specify only one path in it to work around the issue. We need this if/else - # to protect overwriting CUDA environments - set(PY_RUNTIME_ENV ${mkl_BIN_DIRS}) - add_custom_command( - OUTPUT ${api_init_files} - DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops - - # tensorflow/__init__.py depends on files generated in this step. So, remove it while - # this step is running since the files aren't there yet. - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py - - # Run create_python_api.py to generate API init files. - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python PATH=${PY_RUNTIME_ENV} ${PYTHON_EXECUTABLE} - "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py" - "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" - "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" - "--package=tensorflow.python" - "--apiname=tensorflow" - "${api_init_list_file}" - - COMMENT "Generating __init__.py files for Python API." - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" - VERBATIM - ) -else (tensorflow_ENABLE_MKL_SUPPORT) - add_custom_command( - OUTPUT ${api_init_files} - DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops - - # tensorflow/__init__.py depends on files generated in this step. So, remove it while - # this step is running since the files aren't there yet. - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py - - # Run create_python_api.py to generate API init files. - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE} - "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py" - "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" - "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" - "--package=tensorflow.python" - "--apiname=tensorflow" - "${api_init_list_file}" - - COMMENT "Generating __init__.py files for Python API." - WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" - ) -endif (tensorflow_ENABLE_MKL_SUPPORT) + file(TO_CMAKE_PATH "$ENV{PATH}" PY_RUNTIME_ENV) + set(PY_RUNTIME_ENV ${mkl_BIN_DIRS} ${PY_RUNTIME_ENV}) + file(TO_NATIVE_PATH "${PY_RUNTIME_ENV}" PY_RUNTIME_ENV) + set(PY_RUNTIME_ENV "PATH=${PY_RUNTIME_ENV}") +endif(tensorflow_ENABLE_MKL_SUPPORT) + +add_custom_command( + OUTPUT ${api_init_files} + DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops + + # tensorflow/__init__.py depends on files generated in this step. So, remove it while + # this step is running since the files aren't there yet. + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py + + # Run create_python_api.py to generate API init files. + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py" + "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" + "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" + "--package=tensorflow.python" + "--apiname=tensorflow" + "${api_init_list_file}" + + COMMENT "Generating __init__.py files for Python API." + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" + VERBATIM +) add_custom_target(tf_python_api SOURCES ${api_init_files}) add_dependencies(tf_python_api tf_python_ops) @@ -848,12 +831,12 @@ add_custom_command( DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops # Run create_python_api.py to generate API init files. - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/tools/api/generator/create_python_api.py" "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/estimator/api" "--package=tensorflow.python.estimator" "--apiname=estimator" - "--output_package=tensorflow.python.estimator.api" + "--output_package=tensorflow.python.estimator.api" "${estimator_api_init_list_file}" COMMENT "Generating __init__.py files for Python API." diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake index b2330c4e340d531f70234de812ab6f6b2e5c1160..2c878c17167c662d10a8c7dabf41687efdbf65d8 100644 --- a/tensorflow/contrib/cmake/tf_tests.cmake +++ b/tensorflow/contrib/cmake/tf_tests.cmake @@ -122,6 +122,17 @@ function(AddPythonTests) endforeach() endfunction(AddPythonTests) +# +# ensure that every element is an existing file +# +function(CheckExists TYPE SOURCES) + foreach(source ${SOURCES}) + if(NOT EXISTS ${source}) + message(SEND_ERROR "${TYPE} not found: ${source}") + endif() + endforeach(source) +endfunction(CheckExists) + if (tensorflow_BUILD_PYTHON_TESTS) # # python tests. This assumes that the tensorflow wheel is @@ -145,7 +156,6 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py" "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/*_test.py" "${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py" - "${tensorflow_source_dir}/tensorflow/python/meta_graph_transform/*_test.py" "${tensorflow_source_dir}/tensorflow/python/ops/quantized_conv_ops_test.py" "${tensorflow_source_dir}/tensorflow/python/ops/quantized_ops_test.py" "${tensorflow_source_dir}/tensorflow/python/platform/build_info_test.py" @@ -198,7 +208,6 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/saved_model/saved_model_test.py" "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/sparse_image_warp_test.py" # requires scipy - "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/preprocessing/*_test.py" "${tensorflow_source_dir}/tensorflow/contrib/tfprof/python/tools/tfprof/pprof_profiler_test.py" "${tensorflow_source_dir}/tensorflow/contrib/image/python/kernel_tests/interpolate_spline_test.py" # Takes very long to run without sharding (defined in bazel build file). @@ -256,10 +265,9 @@ if (tensorflow_BUILD_PYTHON_TESTS) # Flaky because of local cluster creation. "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py" "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py" - "${tensorflow_source_dir}tensorflow/python/training/localhost_cluster_performance_test.py" + "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py" "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py" "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py" - "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py" # Type error in testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU. "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_test.py" "${tensorflow_source_dir}/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py" @@ -329,6 +337,7 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/io_utils_test.py" # b/72894325 ) endif() + CheckExists(${tf_test_src_py_exclude}) list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude}) AddPythonTests( @@ -480,6 +489,7 @@ if (tensorflow_BUILD_CC_TESTS) "${tensorflow_source_dir}/tensorflow/cc/saved_model/*_test.cc" ) + CheckExists(${tf_test_src_simple_exclude}) list(REMOVE_ITEM tf_test_src_simple ${tf_test_src_simple_exclude} ${tf_cc_saved_model_test_srcs} @@ -494,6 +504,7 @@ if (tensorflow_BUILD_CC_TESTS) ${tf_core_profiler_test_srcs} ) + CheckExists(${tf_src_testlib}) set(tf_test_lib tf_test_lib) add_library(${tf_test_lib} STATIC ${tf_src_testlib}) diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD index a2c6e413039ee3b5af3cb53d1af3325037536d36..855c824ead2f7de4c37db2d2a3648a9ee00fb9e9 100644 --- a/tensorflow/contrib/coder/BUILD +++ b/tensorflow/contrib/coder/BUILD @@ -1,5 +1,5 @@ # Description: -# Contains tools related to data compression. +# Contains ops related to data compression. package(default_visibility = [ "//learning/brain:__subpackages__", @@ -168,7 +168,6 @@ py_library( srcs_version = "PY2AND3", deps = [ ":coder_ops_py", - ":entropybottleneck_py", ], ) @@ -205,44 +204,3 @@ tf_py_test( ], main = "python/ops/coder_ops_test.py", ) - -py_library( - name = "entropybottleneck_py", - srcs = [ - "python/layers/entropybottleneck.py", - ], - srcs_version = "PY2AND3", - deps = [ - ":coder_ops_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:functional_ops", - "//tensorflow/python:init_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn", - "//tensorflow/python:ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:state_ops", - "//tensorflow/python:summary_ops", - "//tensorflow/python:tensor_shape", - "//tensorflow/python:variable_scope", - "//tensorflow/python/eager:context", - "//tensorflow/python/keras:engine", - "//third_party/py/numpy", - ], -) - -tf_py_test( - name = "entropybottleneck_py_test", - srcs = [ - "python/layers/entropybottleneck_test.py", - ], - additional_deps = [ - ":entropybottleneck_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:variables", - "//tensorflow/python:training", - ], - main = "python/layers/entropybottleneck_test.py", -) diff --git a/tensorflow/contrib/coder/README.md b/tensorflow/contrib/coder/README.md deleted file mode 100644 index c6c379c458893551b765327c0c1cbfff7f24f9c3..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/coder/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# Entropy coder - -This module contains range encoder and range decoder which can encode integer -data into string with cumulative distribution functions (CDF). - -## Data and CDF values - -The data to be encoded should be non-negative integers in half-open interval -`[0, m)`. Then a CDF is represented as an integral vector of length `m + 1` -where `CDF(i) = f(Pr(X < i) * 2^precision)` for i = 0,1,...,m, and `precision` -is an attribute in range `0 < precision <= 16`. The function `f` maps real -values into integers, e.g., round or floor. It is important that to encode a -number `i`, `CDF(i + 1) - CDF(i)` cannot be zero. - -Note that we used `Pr(X < i)` not `Pr(X <= i)`, and therefore CDF(0) = 0 always. - -## RangeEncode: data shapes and CDF shapes - -For each data element, its CDF has to be provided. Therefore if the shape of CDF -should be `data.shape + (m + 1,)` in NumPy-like notation. For example, if `data` -is a 2-D tensor of shape (10, 10) and its elements are in `[0, 64)`, then the -CDF tensor should have shape (10, 10, 65). - -This may make CDF tensor too large, and in many applications all data elements -may have the same probability distribution. To handle this, `RangeEncode` -supports limited broadcasting CDF into data. Broadcasting is limited in the -following sense: - -- All CDF axes but the last one is broadcasted into data but not the other way - around, -- The number of CDF axes does not extend, i.e., `CDF.ndim == data.ndim + 1`. - -In the previous example where data has shape (10, 10), the following are -acceptable CDF shapes: - -- (10, 10, 65) -- (1, 10, 65) -- (10, 1, 65) -- (1, 1, 65) - -## RangeDecode - -`RangeEncode` encodes neither data shape nor termination character. Therefore -the decoder should know how many characters are encoded into the string, and -`RangeDecode` takes the encoded data shape as the second argument. The same -shape restrictions as `RangeEncode` inputs apply here. - -## Example - -```python -data = tf.random_uniform((128, 128), 0, 10, dtype=tf.int32) - -histogram = tf.bincount(data, minlength=10, maxlength=10) -cdf = tf.cumsum(histogram, exclusive=False) -# CDF should have length m + 1. -cdf = tf.pad(cdf, [[1, 0]]) -# CDF axis count must be one more than data. -cdf = tf.reshape(cdf, [1, 1, -1]) - -# Note that data has 2^14 elements, and therefore the sum of CDF is 2^14. -data = tf.cast(data, tf.int16) -encoded = coder.range_encode(data, cdf, precision=14) -decoded = coder.range_decode(encoded, tf.shape(data), cdf, precision=14) - -# data and decoded should be the same. -sess = tf.Session() -x, y = sess.run((data, decoded)) -assert np.all(x == y) -``` - -## Authors -Sung Jin Hwang (github: [ssjhv](https://github.com/ssjhv)) and Nick Johnston -(github: [nmjohn](https://github.com/nmjohn)) diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py index 99b8ac7595ec632b2918e6b7ca22c06dd7f0a8b3..8897312046c63c42d85e7fba5b62d2ed908dd6e9 100644 --- a/tensorflow/contrib/coder/__init__.py +++ b/tensorflow/contrib/coder/__init__.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Data compression tools.""" +"""Data compression ops.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import -from tensorflow.contrib.coder.python.layers.entropybottleneck import * from tensorflow.contrib.coder.python.ops.coder_ops import * # pylint: enable=wildcard-import diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py deleted file mode 100644 index 0c997bd4fdfa4233117c9fec2c4397301b1c8cb9..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/coder/python/layers/entropybottleneck.py +++ /dev/null @@ -1,697 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Entropy bottleneck layer.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.contrib.coder.python.ops import coder_ops - -from tensorflow.python.eager import context -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape -from tensorflow.python.keras.engine import base_layer -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import functional_ops -from tensorflow.python.ops import init_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import random_ops -from tensorflow.python.ops import state_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.summary import summary - - -class EntropyBottleneck(base_layer.Layer): - """Entropy bottleneck layer. - - This layer can be used to model the entropy (the amount of information - conveyed) of the tensor passing through it. During training, this can be used - to impose a (soft) entropy constraint on its activations, limiting the amount - of information flowing through the layer. Note that this is distinct from - other types of bottlenecks, which reduce the dimensionality of the space, for - example. Dimensionality reduction does not limit the amount of information, - and does not enable efficient data compression per se. - - After training, this layer can be used to compress any input tensor to a - string, which may be written to a file, and to decompress a file which it - previously generated back to a reconstructed tensor (possibly on a different - machine having access to the same model checkpoint). The entropies estimated - during training or evaluation are approximately equal to the average length of - the strings in bits. - - The layer implements a flexible probability density model to estimate entropy, - which is described in the appendix of the paper (please cite the paper if you - use this code for scientific work): - - "Variational image compression with a scale hyperprior" - - Johannes Ballé, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston - - https://arxiv.org/abs/1802.01436 - - The layer assumes that the input tensor is at least 2D, with a batch dimension - at the beginning and a channel dimension as specified by `data_format`. The - layer trains an independent probability density model for each channel, but - assumes that across all other dimensions, the inputs are i.i.d. (independent - and identically distributed). Because the entropy (and hence, average - codelength) is a function of the densities, this assumption may have a direct - effect on the compression performance. - - Because data compression always involves discretization, the outputs of the - layer are generally only approximations of its inputs. During training, - discretization is modeled using additive uniform noise to ensure - differentiability. The entropies computed during training are differential - entropies. During evaluation, the data is actually quantized, and the - entropies are discrete (Shannon entropies). To make sure the approximated - tensor values are good enough for practical purposes, the training phase must - be used to balance the quality of the approximation with the entropy, by - adding an entropy term to the training loss, as in the following example. - - Here, we use the entropy bottleneck to compress the latent representation of - an autoencoder. The data vectors `x` in this case are 4D tensors in - `'channels_last'` format (for example, 16x16 pixel grayscale images). - - The layer always produces exactly one auxiliary loss and one update op which - are only significant for compression and decompression. To use the compression - feature, the auxiliary loss must be minimized during or after training. After - that, the update op must be executed at least once. Here, we simply attach - them to the main training step. - - Training: - ``` - # Build autoencoder. - x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) - y = forward_transform(x) - entropy_bottleneck = EntropyBottleneck() - y_, likelihoods = entropy_bottleneck(y, training=True) - x_ = backward_transform(y_) - - # Information content (= predicted codelength) in bits of each batch element - # (note that taking the natural logarithm and dividing by `log(2)` is - # equivalent to taking base-2 logarithms): - bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) - - # Squared difference of each batch element: - squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) - - # The loss is a weighted sum of mean squared error and entropy (average - # information content), where the weight controls the trade-off between - # approximation error and entropy. - main_loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) - - # Minimize loss and auxiliary loss, and execute update op. - main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4) - main_step = optimizer.minimize(main_loss) - # 1e-2 is a good starting point for the learning rate of the auxiliary loss, - # assuming Adam is used. - aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-2) - aux_step = optimizer.minimize(entropy_bottleneck.losses[0]) - step = tf.group(main_step, aux_step, entropy_bottleneck.updates[0]) - ``` - - Evaluation: - ``` - # Build autoencoder. - x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) - y = forward_transform(x) - y_, likelihoods = EntropyBottleneck()(y, training=False) - x_ = backward_transform(y_) - - # Information content (= predicted codelength) in bits of each batch element: - bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) - - # Squared difference of each batch element: - squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) - - # The loss is a weighted sum of mean squared error and entropy (average - # information content), where the weight controls the trade-off between - # approximation error and entropy. - loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) - ``` - - To be able to compress the bottleneck tensor and decompress it in a different - session, or on a different machine, you need three items: - - The compressed representations stored as strings. - - The shape of the bottleneck for these string representations as a `Tensor`, - as well as the number of channels of the bottleneck at graph construction - time. - - The checkpoint of the trained model that was used for compression. Note: - It is crucial that the auxiliary loss produced by this layer is minimized - during or after training, and that the update op is run after training and - minimization of the auxiliary loss, but *before* the checkpoint is saved. - - Compression: - ``` - x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) - y = forward_transform(x) - strings = EntropyBottleneck().compress(y) - shape = tf.shape(y)[1:] - ``` - - Decompression: - ``` - strings = tf.placeholder(tf.string, shape=[None]) - shape = tf.placeholder(tf.int32, shape=[3]) - entropy_bottleneck = EntropyBottleneck(dtype=tf.float32) - y_ = entropy_bottleneck.decompress(strings, shape, channels=5) - x_ = backward_transform(y_) - ``` - Here, we assumed that the tensor produced by the forward transform has 5 - channels. - - The above four use cases can also be implemented within the same session (i.e. - on the same `EntropyBottleneck` instance), for testing purposes, etc., by - calling the object more than once. - - Arguments: - init_scale: Float. A scaling factor determining the initial width of the - probability densities. This should be chosen big enough so that the - range of values of the layer inputs roughly falls within the interval - [`-init_scale`, `init_scale`] at the beginning of training. - filters: An iterable of ints, giving the number of filters at each layer of - the density model. Generally, the more filters and layers, the more - expressive is the density model in terms of modeling more complicated - distributions of the layer inputs. For details, refer to the paper - referenced above. The default is `[3, 3, 3]`, which should be sufficient - for most practical purposes. - tail_mass: Float, between 0 and 1. The bottleneck layer automatically - determines the range of input values that should be represented based on - their frequency of occurrence. Values occurring in the tails of the - distributions will be clipped to that range during compression. - `tail_mass` determines the amount of probability mass in the tails which - is cut off in the worst case. For example, the default value of `1e-9` - means that at most 1 in a billion input samples will be clipped to the - range. - optimize_integer_offset: Boolean. Typically, the input values of this layer - are floats, which means that quantization during evaluation can be - performed with an arbitrary offset. By default, the layer determines that - offset automatically. In special situations, such as when it is known that - the layer will receive only full integer values during evaluation, it can - be desirable to set this argument to `False` instead, in order to always - quantize to full integer values. - likelihood_bound: Float. If positive, the returned likelihood values are - ensured to be greater than or equal to this value. This prevents very - large gradients with a typical entropy loss (defaults to 1e-9). - range_coder_precision: Integer, between 1 and 16. The precision of the range - coder used for compression and decompression. This trades off computation - speed with compression efficiency, where 16 is the slowest but most - efficient setting. Choosing lower values may increase the average - codelength slightly compared to the estimated entropies. - data_format: Either `'channels_first'` or `'channels_last'` (default). - trainable: Boolean. Whether the layer should be trained. - name: String. The name of the layer. - dtype: Default dtype of the layer's parameters (default of `None` means use - the type of the first input). - - Read-only properties: - init_scale: See above. - filters: See above. - tail_mass: See above. - optimize_integer_offset: See above. - likelihood_bound: See above. - range_coder_precision: See above. - data_format: See above. - name: String. See above. - dtype: See above. - trainable_variables: List of trainable variables. - non_trainable_variables: List of non-trainable variables. - variables: List of all variables of this layer, trainable and non-trainable. - updates: List of update ops of this layer. Always contains exactly one - update op, which must be run once after the last training step, before - `compress` or `decompress` is used. - losses: List of losses added by this layer. Always contains exactly one - auxiliary loss, which must be added to the training loss. - - Mutable properties: - trainable: Boolean. Whether the layer should be trained. - input_spec: Optional `InputSpec` object specifying the constraints on inputs - that can be accepted by the layer. - """ - - def __init__(self, init_scale=10, filters=(3, 3, 3), tail_mass=1e-9, - optimize_integer_offset=True, likelihood_bound=1e-9, - range_coder_precision=16, data_format="channels_last", **kwargs): - super(EntropyBottleneck, self).__init__(**kwargs) - self._init_scale = float(init_scale) - self._filters = tuple(int(f) for f in filters) - self._tail_mass = float(tail_mass) - if not 0 < self.tail_mass < 1: - raise ValueError( - "`tail_mass` must be between 0 and 1, got {}.".format(self.tail_mass)) - self._optimize_integer_offset = bool(optimize_integer_offset) - self._likelihood_bound = float(likelihood_bound) - self._range_coder_precision = int(range_coder_precision) - self._data_format = data_format - self._channel_axis(2) # trigger ValueError early - self.input_spec = base_layer.InputSpec(min_ndim=2) - - @property - def init_scale(self): - return self._init_scale - - @property - def filters(self): - return self._filters - - @property - def tail_mass(self): - return self._tail_mass - - @property - def optimize_integer_offset(self): - return self._optimize_integer_offset - - @property - def likelihood_bound(self): - return self._likelihood_bound - - @property - def range_coder_precision(self): - return self._range_coder_precision - - @property - def data_format(self): - return self._data_format - - def _channel_axis(self, ndim): - try: - return {"channels_first": 1, "channels_last": ndim - 1}[self.data_format] - except KeyError: - raise ValueError("Unsupported `data_format` for {} layer: {}.".format( - self.__class__.__name__, self.data_format)) - - def _logits_cumulative(self, inputs, stop_gradient): - """Evaluate logits of the cumulative densities. - - Args: - inputs: The values at which to evaluate the cumulative densities, expected - to be a `Tensor` of shape `(channels, 1, batch)`. - stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so - that the gradient of the output with respect to the density model - parameters is disconnected (the gradient with respect to `inputs` is - left untouched). - - Returns: - A `Tensor` of the same shape as `inputs`, containing the logits of the - cumulative densities evaluated at the given inputs. - """ - logits = inputs - - for i in range(len(self.filters) + 1): - matrix = self._matrices[i] - if stop_gradient: - matrix = array_ops.stop_gradient(matrix) - logits = math_ops.matmul(matrix, logits) - - bias = self._biases[i] - if stop_gradient: - bias = array_ops.stop_gradient(bias) - logits += bias - - if i < len(self._factors): - factor = self._factors[i] - if stop_gradient: - factor = array_ops.stop_gradient(factor) - logits += factor * math_ops.tanh(logits) - - return logits - - def build(self, input_shape): - """Builds the layer. - - Creates the variables for the network modeling the densities, creates the - auxiliary loss estimating the median and tail quantiles of the densities, - and then uses that to create the probability mass functions and the update - op that produces the discrete cumulative density functions used by the range - coder. - - Args: - input_shape: Shape of the input tensor, used to get the number of - channels. - - Raises: - ValueError: if `input_shape` doesn't specify the length of the channel - dimension. - """ - input_shape = tensor_shape.TensorShape(input_shape) - channel_axis = self._channel_axis(input_shape.ndims) - channels = input_shape[channel_axis].value - if channels is None: - raise ValueError("The channel dimension of the inputs must be defined.") - self.input_spec = base_layer.InputSpec( - ndim=input_shape.ndims, axes={channel_axis: channels}) - filters = (1,) + self.filters + (1,) - scale = self.init_scale ** (1 / (len(self.filters) + 1)) - - # Create variables. - self._matrices = [] - self._biases = [] - self._factors = [] - for i in range(len(self.filters) + 1): - init = np.log(np.expm1(1 / scale / filters[i + 1])) - matrix = self.add_variable( - "matrix_{}".format(i), dtype=self.dtype, - shape=(channels, filters[i + 1], filters[i]), - initializer=init_ops.Constant(init)) - matrix = nn.softplus(matrix) - self._matrices.append(matrix) - - bias = self.add_variable( - "bias_{}".format(i), dtype=self.dtype, - shape=(channels, filters[i + 1], 1), - initializer=init_ops.RandomUniform(-.5, .5)) - self._biases.append(bias) - - if i < len(self.filters): - factor = self.add_variable( - "factor_{}".format(i), dtype=self.dtype, - shape=(channels, filters[i + 1], 1), - initializer=init_ops.Zeros()) - factor = math_ops.tanh(factor) - self._factors.append(factor) - - # To figure out what range of the densities to sample, we need to compute - # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we - # can't take inverses of the cumulative directly, we make it an optimization - # problem: - # `quantiles = argmin(|logit(cumulative) - target|)` - # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`. - # Taking the logit (inverse of sigmoid) of the cumulative makes the - # representation of the right target more numerically stable. - - # Numerically stable way of computing logits of `tail_mass / 2` - # and `1 - tail_mass / 2`. - target = np.log(2 / self.tail_mass - 1) - # Compute lower and upper tail quantile as well as median. - target = constant_op.constant([-target, 0, target], dtype=self.dtype) - - def quantiles_initializer(shape, dtype=None, partition_info=None): - del partition_info # unused - assert tuple(shape[1:]) == (1, 3) - init = constant_op.constant( - [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype) - return array_ops.tile(init, (shape[0], 1, 1)) - - quantiles = self.add_variable( - "quantiles", shape=(channels, 1, 3), dtype=self.dtype, - initializer=quantiles_initializer) - logits = self._logits_cumulative(quantiles, stop_gradient=True) - loss = math_ops.reduce_sum(abs(logits - target)) - self.add_loss(loss, inputs=None) - - # Save medians for `call`, `compress`, and `decompress`. - self._medians = quantiles[:, :, 1:2] - if not self.optimize_integer_offset: - self._medians = math_ops.round(self._medians) - - # Largest distance observed between lower tail quantile and median, - # or between median and upper tail quantile. - minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1]) - maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians) - minmax = math_ops.maximum(minima, maxima) - minmax = math_ops.ceil(minmax) - minmax = math_ops.maximum(minmax, 1) - - # Sample the density up to `minmax` around the median. - samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype) - samples += self._medians - - half = constant_op.constant(.5, dtype=self.dtype) - # We strip the sigmoid from the end here, so we can use the special rule - # below to only compute differences in the left tail of the sigmoid. - # This increases numerical stability (see explanation in `call`). - lower = self._logits_cumulative(samples - half, stop_gradient=True) - upper = self._logits_cumulative(samples + half, stop_gradient=True) - # Flip signs if we can move more towards the left tail of the sigmoid. - sign = -math_ops.sign(math_ops.add_n([lower, upper])) - pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) - # Add tail masses to first and last bin of pmf, as we clip values for - # compression, meaning that out-of-range values get mapped to these bins. - pmf = array_ops.concat([ - math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]), - pmf[:, 0, 1:-1], - math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]), - ], axis=-1) - self._pmf = pmf - - cdf = coder_ops.pmf_to_quantized_cdf( - pmf, precision=self.range_coder_precision) - def cdf_getter(*args, **kwargs): - del args, kwargs # ignored - return variable_scope.get_variable( - "quantized_cdf", dtype=dtypes.int32, initializer=cdf, - trainable=False, validate_shape=False, collections=()) - # Need to provide a fake shape here since add_variable insists on it. - self._quantized_cdf = self.add_variable( - "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32, - getter=cdf_getter, trainable=False) - - update_op = state_ops.assign( - self._quantized_cdf, cdf, validate_shape=False) - self.add_update(update_op, inputs=None) - - super(EntropyBottleneck, self).build(input_shape) - - def call(self, inputs, training): - """Pass a tensor through the bottleneck. - - Args: - inputs: The tensor to be passed through the bottleneck. - training: Boolean. If `True`, returns a differentiable approximation of - the inputs, and their likelihoods under the modeled probability - densities. If `False`, returns the quantized inputs and their - likelihoods under the corresponding probability mass function. These - quantities can't be used for training, as they are not differentiable, - but represent actual compression more closely. - - Returns: - values: `Tensor` with the same shape as `inputs` containing the perturbed - or quantized input values. - likelihood: `Tensor` with the same shape as `inputs` containing the - likelihood of `values` under the modeled probability distributions. - - Raises: - ValueError: if `inputs` has different `dtype` or number of channels than - a previous set of inputs the model was invoked with earlier. - """ - inputs = ops.convert_to_tensor(inputs) - ndim = self.input_spec.ndim - channel_axis = self._channel_axis(ndim) - half = constant_op.constant(.5, dtype=self.dtype) - - # Convert to (channels, 1, batch) format by commuting channels to front - # and then collapsing. - order = list(range(ndim)) - order.pop(channel_axis) - order.insert(0, channel_axis) - values = array_ops.transpose(inputs, order) - shape = array_ops.shape(values) - values = array_ops.reshape(values, (shape[0], 1, -1)) - - # Add noise or quantize. - if training: - noise = random_ops.random_uniform(array_ops.shape(values), -half, half) - values = math_ops.add_n([values, noise]) - elif self.optimize_integer_offset: - values = math_ops.round(values - self._medians) + self._medians - else: - values = math_ops.round(values) - - # Evaluate densities. - # We can use the special rule below to only compute differences in the left - # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1 - # for large x, 0 for small x. Subtracting two numbers close to 0 can be done - # with much higher precision than subtracting two numbers close to 1. - lower = self._logits_cumulative(values - half, stop_gradient=False) - upper = self._logits_cumulative(values + half, stop_gradient=False) - # Flip signs if we can move more towards the left tail of the sigmoid. - sign = -math_ops.sign(math_ops.add_n([lower, upper])) - sign = array_ops.stop_gradient(sign) - likelihood = abs( - math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) - if self.likelihood_bound > 0: - likelihood_bound = constant_op.constant( - self.likelihood_bound, dtype=self.dtype) - # TODO(jballe): Override gradients. - likelihood = math_ops.maximum(likelihood, likelihood_bound) - - # Convert back to input tensor shape. - order = list(range(1, ndim)) - order.insert(channel_axis, 0) - values = array_ops.reshape(values, shape) - values = array_ops.transpose(values, order) - likelihood = array_ops.reshape(likelihood, shape) - likelihood = array_ops.transpose(likelihood, order) - - if not context.executing_eagerly(): - values_shape, likelihood_shape = self.compute_output_shape(inputs.shape) - values.set_shape(values_shape) - likelihood.set_shape(likelihood_shape) - - return values, likelihood - - def compress(self, inputs): - """Compress inputs and store their binary representations into strings. - - Args: - inputs: `Tensor` with values to be compressed. - - Returns: - String `Tensor` vector containing the compressed representation of each - batch element of `inputs`. - """ - with ops.name_scope(self._name_scope()): - inputs = ops.convert_to_tensor(inputs) - if not self.built: - # Check input assumptions set before layer building, e.g. input rank. - self._assert_input_compatibility(inputs) - if self.dtype is None: - self._dtype = inputs.dtype.base_dtype.name - self.build(inputs.shape) - - # Check input assumptions set after layer building, e.g. input shape. - if not context.executing_eagerly(): - self._assert_input_compatibility(inputs) - - ndim = self.input_spec.ndim - channel_axis = self._channel_axis(ndim) - # Tuple of slices for expanding dimensions of tensors below. - slices = ndim * [None] + [slice(None)] - slices[channel_axis] = slice(None) - slices = tuple(slices) - - # Expand dimensions of CDF to input dimensions, keeping the channels along - # the right dimension. - cdf = self._quantized_cdf[slices[1:]] - num_levels = array_ops.shape(cdf)[-1] - 1 - - # Bring inputs to the right range by centering the range on the medians. - half = constant_op.constant(.5, dtype=self.dtype) - medians = array_ops.squeeze(self._medians, [1, 2]) - offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians - # Expand offsets to input dimensions and add to inputs. - values = inputs + offsets[slices[:-1]] - - # Clip to range and cast to integers. Because we have added .5 above, and - # all values are positive, the cast effectively implements rounding. - values = math_ops.maximum(values, half) - values = math_ops.minimum( - values, math_ops.cast(num_levels, self.dtype) - half) - values = math_ops.cast(values, dtypes.int16) - - def loop_body(tensor): - return coder_ops.range_encode( - tensor, cdf, precision=self.range_coder_precision) - strings = functional_ops.map_fn( - loop_body, values, dtype=dtypes.string, back_prop=False) - - if not context.executing_eagerly(): - strings.set_shape(inputs.shape[:1]) - - return strings - - def decompress(self, strings, shape, channels=None): - """Decompress values from their compressed string representations. - - Args: - strings: A string `Tensor` vector containing the compressed data. - shape: A `Tensor` vector of int32 type. Contains the shape of the tensor - to be decompressed, excluding the batch dimension. - channels: Integer. Specifies the number of channels statically. Needs only - be set if the layer hasn't been built yet (i.e., this is the first input - it receives). - - Returns: - The decompressed `Tensor`. Its shape will be equal to `shape` prepended - with the batch dimension from `strings`. - - Raises: - ValueError: If the length of `shape` isn't available at graph construction - time. - """ - with ops.name_scope(self._name_scope()): - strings = ops.convert_to_tensor(strings) - shape = ops.convert_to_tensor(shape) - if self.built: - ndim = self.input_spec.ndim - channel_axis = self._channel_axis(ndim) - if channels is None: - channels = self.input_spec.axes[channel_axis] - else: - if not (shape.shape.is_fully_defined() and shape.shape.ndims == 1): - raise ValueError("`shape` must be a vector with known length.") - ndim = shape.shape[0].value + 1 - channel_axis = self._channel_axis(ndim) - input_shape = ndim * [None] - input_shape[channel_axis] = channels - self.build(input_shape) - - # Tuple of slices for expanding dimensions of tensors below. - slices = ndim * [None] + [slice(None)] - slices[channel_axis] = slice(None) - slices = tuple(slices) - - # Expand dimensions of CDF to input dimensions, keeping the channels along - # the right dimension. - cdf = self._quantized_cdf[slices[1:]] - num_levels = array_ops.shape(cdf)[-1] - 1 - - def loop_body(string): - return coder_ops.range_decode( - string, shape, cdf, precision=self.range_coder_precision) - outputs = functional_ops.map_fn( - loop_body, strings, dtype=dtypes.int16, back_prop=False) - outputs = math_ops.cast(outputs, self.dtype) - - medians = array_ops.squeeze(self._medians, [1, 2]) - offsets = math_ops.cast(num_levels // 2, self.dtype) - medians - outputs -= offsets[slices[:-1]] - - if not context.executing_eagerly(): - outputs_shape = ndim * [None] - outputs_shape[0] = strings.shape[0] - outputs_shape[channel_axis] = channels - outputs.set_shape(outputs_shape) - - return outputs - - def visualize(self): - """Multi-channel visualization of densities as images. - - Creates and returns an image summary visualizing the current probabilty - density estimates. The image contains one row for each channel. Within each - row, the pixel intensities are proportional to probability values, and each - row is centered on the median of the corresponding distribution. - - Returns: - The created image summary. - """ - with ops.name_scope(self._name_scope()): - image = self._pmf - image *= 255 / math_ops.reduce_max(image, axis=1, keepdims=True) - image = math_ops.cast(image + .5, dtypes.uint8) - image = image[None, :, :, None] - return summary.image("pmf", image, max_outputs=1) - - def compute_output_shape(self, input_shape): - input_shape = tensor_shape.TensorShape(input_shape) - return input_shape, input_shape diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py deleted file mode 100644 index 798b0234ebcce7df108a0da65d1305502ce0253a..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py +++ /dev/null @@ -1,315 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests of EntropyBottleneck class.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.contrib.coder.python.layers import entropybottleneck - -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variables -from tensorflow.python.platform import test -from tensorflow.python.training import gradient_descent - - -class EntropyBottleneckTest(test.TestCase): - - def test_noise(self): - # Tests that the noise added is uniform noise between -0.5 and 0.5. - inputs = array_ops.placeholder(dtypes.float32, (None, 1)) - layer = entropybottleneck.EntropyBottleneck() - noisy, _ = layer(inputs, training=True) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - values = np.linspace(-50, 50, 100)[:, None] - noisy, = sess.run([noisy], {inputs: values}) - self.assertFalse(np.allclose(values, noisy, rtol=0, atol=.49)) - self.assertAllClose(values, noisy, rtol=0, atol=.5) - - def test_quantization(self): - # Tests that inputs are quantized to full integer values, even after - # quantiles have been updated. - inputs = array_ops.placeholder(dtypes.float32, (None, 1)) - layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=False) - quantized, _ = layer(inputs, training=False) - opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) - self.assertTrue(len(layer.losses) == 1) - step = opt.minimize(layer.losses[0]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - sess.run(step) - values = np.linspace(-50, 50, 100)[:, None] - quantized, = sess.run([quantized], {inputs: values}) - self.assertAllClose(np.around(values), quantized, rtol=0, atol=1e-6) - - def test_quantization_optimized_offset(self): - # Tests that inputs are not quantized to full integer values after quantiles - # have been updated. However, the difference between input and output should - # be between -0.5 and 0.5, and the offset must be consistent. - inputs = array_ops.placeholder(dtypes.float32, (None, 1)) - layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=True) - quantized, _ = layer(inputs, training=False) - opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) - self.assertTrue(len(layer.losses) == 1) - step = opt.minimize(layer.losses[0]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - sess.run(step) - values = np.linspace(-50, 50, 100)[:, None] - quantized, = sess.run([quantized], {inputs: values}) - self.assertAllClose(values, quantized, rtol=0, atol=.5) - diff = np.ravel(np.around(values) - quantized) % 1 - self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) - self.assertNotEqual(diff[0], 0) - - def test_codec(self): - # Tests that inputs are compressed and decompressed correctly, and quantized - # to full integer values, even after quantiles have been updated. - inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_last", init_scale=60, - optimize_integer_offset=False) - bitstrings = layer.compress(inputs) - decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) - opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) - self.assertTrue(len(layer.losses) == 1) - step = opt.minimize(layer.losses[0]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - sess.run(step) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - values = np.linspace(-50, 50, 100)[None, :, None] - decoded, = sess.run([decoded], {inputs: values}) - self.assertAllClose(np.around(values), decoded, rtol=0, atol=1e-6) - - def test_codec_optimized_offset(self): - # Tests that inputs are compressed and decompressed correctly, and not - # quantized to full integer values after quantiles have been updated. - # However, the difference between input and output should be between -0.5 - # and 0.5, and the offset must be consistent. - inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_last", init_scale=60, - optimize_integer_offset=True) - bitstrings = layer.compress(inputs) - decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) - opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) - self.assertTrue(len(layer.losses) == 1) - step = opt.minimize(layer.losses[0]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - sess.run(step) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - values = np.linspace(-50, 50, 100)[None, :, None] - decoded, = sess.run([decoded], {inputs: values}) - self.assertAllClose(values, decoded, rtol=0, atol=.5) - diff = np.ravel(np.around(values) - decoded) % 1 - self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) - self.assertNotEqual(diff[0], 0) - - def test_codec_clipping(self): - # Tests that inputs are compressed and decompressed correctly, and clipped - # to the expected range. - inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_last", init_scale=40) - bitstrings = layer.compress(inputs) - decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - values = np.linspace(-50, 50, 100)[None, :, None] - decoded, = sess.run([decoded], {inputs: values}) - expected = np.clip(np.around(values), -40, 40) - self.assertAllClose(expected, decoded, rtol=0, atol=1e-6) - - def test_channels_last(self): - # Test the layer with more than one channel and multiple input dimensions, - # with the channels in the last dimension. - inputs = array_ops.placeholder(dtypes.float32, (None, None, None, 2)) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_last", init_scale=50) - noisy, _ = layer(inputs, training=True) - quantized, _ = layer(inputs, training=False) - bitstrings = layer.compress(inputs) - decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - values = 5 * np.random.normal(size=(7, 5, 3, 2)) - noisy, quantized, decoded = sess.run( - [noisy, quantized, decoded], {inputs: values}) - self.assertAllClose(values, noisy, rtol=0, atol=.5) - self.assertAllClose(values, quantized, rtol=0, atol=.5) - self.assertAllClose(values, decoded, rtol=0, atol=.5) - - def test_channels_first(self): - # Test the layer with more than one channel and multiple input dimensions, - # with the channel dimension right after the batch dimension. - inputs = array_ops.placeholder(dtypes.float32, (None, 3, None, None)) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_first", init_scale=50) - noisy, _ = layer(inputs, training=True) - quantized, _ = layer(inputs, training=False) - bitstrings = layer.compress(inputs) - decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - values = 5 * np.random.normal(size=(2, 3, 5, 7)) - noisy, quantized, decoded = sess.run( - [noisy, quantized, decoded], {inputs: values}) - self.assertAllClose(values, noisy, rtol=0, atol=.5) - self.assertAllClose(values, quantized, rtol=0, atol=.5) - self.assertAllClose(values, decoded, rtol=0, atol=.5) - - def test_compress(self): - # Test compression and decompression, and produce test data for - # `test_decompress`. If you set the constant at the end to `True`, this test - # will fail and the log will contain the new test data. - inputs = array_ops.placeholder(dtypes.float32, (2, 3, 10)) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_first", filters=(), init_scale=2) - bitstrings = layer.compress(inputs) - decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - values = 5 * np.random.uniform(size=(2, 3, 10)) - 2.5 - bitstrings, quantized_cdf, decoded = sess.run( - [bitstrings, layer._quantized_cdf, decoded], {inputs: values}) - self.assertAllClose(values, decoded, rtol=0, atol=.5) - # Set this constant to `True` to log new test data for `test_decompress`. - if False: # pylint:disable=using-constant-test - assert False, (bitstrings, quantized_cdf, decoded) - - # Data generated by `test_compress`. - # pylint:disable=g-inconsistent-quotes,bad-whitespace - bitstrings = np.array([ - b'\x1e\xbag}\xc2\xdaN\x8b\xbd.', - b'\x8dF\xf0%\x1cv\xccllW' - ], dtype=object) - - quantized_cdf = np.array([ - [ 0, 15636, 22324, 30145, 38278, 65536], - [ 0, 19482, 26927, 35052, 42904, 65535], - [ 0, 21093, 28769, 36919, 44578, 65536] - ], dtype=np.int32) - - expected = np.array([ - [[-2., 1., 0., -2., -1., -2., -2., -2., 2., -1.], - [ 1., 2., 1., 0., -2., -2., 1., 2., 0., 1.], - [ 2., 0., -2., 2., 0., -1., -2., 0., 2., 0.]], - [[ 1., 2., 0., -1., 1., 2., 1., 1., 2., -2.], - [ 2., -1., -1., 0., -1., 2., 0., 2., -2., 2.], - [ 2., -2., -2., -1., -2., 1., -2., 0., 0., 0.]] - ], dtype=np.float32) - # pylint:enable=g-inconsistent-quotes,bad-whitespace - - def test_decompress(self): - # Test that decompression of values compressed with a previous version - # works, i.e. that the file format doesn't change across revisions. - bitstrings = array_ops.placeholder(dtypes.string) - input_shape = array_ops.placeholder(dtypes.int32) - quantized_cdf = array_ops.placeholder(dtypes.int32) - layer = entropybottleneck.EntropyBottleneck( - data_format="channels_first", filters=(), dtype=dtypes.float32) - layer.build(self.expected.shape) - layer._quantized_cdf = quantized_cdf - decoded = layer.decompress(bitstrings, input_shape[1:]) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - decoded, = sess.run([decoded], { - bitstrings: self.bitstrings, input_shape: self.expected.shape, - quantized_cdf: self.quantized_cdf}) - self.assertAllClose(self.expected, decoded, rtol=0, atol=1e-6) - - def test_build_decompress(self): - # Test that layer can be built when `decompress` is the first call to it. - bitstrings = array_ops.placeholder(dtypes.string) - input_shape = array_ops.placeholder(dtypes.int32, shape=[3]) - layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) - layer.decompress(bitstrings, input_shape[1:], channels=5) - self.assertTrue(layer.built) - - def test_pmf_normalization(self): - # Test that probability mass functions are normalized correctly. - layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) - layer.build((None, 10)) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - pmf, = sess.run([layer._pmf]) - self.assertAllClose(np.ones(10), np.sum(pmf, axis=-1), rtol=0, atol=1e-6) - - def test_visualize(self): - # Test that summary op can be constructed. - layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) - layer.build((None, 10)) - summary = layer.visualize() - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - sess.run([summary]) - - def test_normalization(self): - # Test that densities are normalized correctly. - inputs = array_ops.placeholder(dtypes.float32, (None, 1)) - layer = entropybottleneck.EntropyBottleneck(filters=(2,)) - _, likelihood = layer(inputs, training=True) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - x = np.repeat(np.arange(-200, 201), 1000)[:, None] - likelihood, = sess.run([likelihood], {inputs: x}) - self.assertEqual(x.shape, likelihood.shape) - integral = np.sum(likelihood) * .001 - self.assertAllClose(1, integral, rtol=0, atol=1e-4) - - def test_entropy_estimates(self): - # Test that entropy estimates match actual range coding. - inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) - layer = entropybottleneck.EntropyBottleneck( - filters=(2, 3), data_format="channels_last") - _, likelihood = layer(inputs, training=True) - diff_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) - _, likelihood = layer(inputs, training=False) - disc_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) - bitstrings = layer.compress(inputs) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - self.assertTrue(len(layer.updates) == 1) - sess.run(layer.updates[0]) - diff_entropy, disc_entropy, bitstrings = sess.run( - [diff_entropy, disc_entropy, bitstrings], - {inputs: np.random.normal(size=(1, 10000, 1))}) - codelength = 8 * sum(len(bitstring) for bitstring in bitstrings) - self.assertAllClose(diff_entropy, disc_entropy, rtol=5e-3, atol=0) - self.assertAllClose(disc_entropy, codelength, rtol=5e-3, atol=0) - self.assertGreater(codelength, disc_entropy) - - -if __name__ == "__main__": - test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py index 3791dae8d7f6b03bc1115bca97811dfc4775c45b..ff846b191a34e3f3b4aa35671ca22b96b963db80 100644 --- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py @@ -150,7 +150,7 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix): "matrix must be two dimensional (instead is %d-dimensional)" % matrix_shape.ndims) if matrix_shape[0] != matrix_shape[1]: - raise ValueError("matrix must be be square (instead has shape (%d,%d))" % + raise ValueError("matrix must be square (instead has shape (%d,%d))" % (matrix_shape[0], matrix_shape[1])) dimension = matrix_shape[0].value if dimension is None: diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py index f56a973f6f80b81697e9f58578e60a2efb90154e..8cfe14205927bf7763cf36fa31012ab10fce995c 100644 --- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py +++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py @@ -158,7 +158,7 @@ class CrfTest(test.TestCase): # Test both the length-1 and regular cases. sequence_lengths_list = [ np.array(3, dtype=np.int32), - np.array(1, dtype=np.int32) + np.array(1, dtype=np.int64) ] inputs_list = [ np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], @@ -291,7 +291,7 @@ class CrfTest(test.TestCase): # Test both the length-1 and regular cases. sequence_lengths_list = [ np.array(3, dtype=np.int32), - np.array(1, dtype=np.int32) + np.array(1, dtype=np.int64) ] inputs_list = [ np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py index 8a7ff61bc8391efe453ee37019c23bd6ccbdf066..2a91dcb63a80016e62d10d1310ca57e3e54434c5 100644 --- a/tensorflow/contrib/crf/python/ops/crf.py +++ b/tensorflow/contrib/crf/python/ops/crf.py @@ -548,7 +548,9 @@ def crf_decode(potentials, transition_params, sequence_length): initial_state = array_ops.squeeze(initial_state, axis=[1]) # [B, O] inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1]) # [B, T-1, O] # Sequence length is not allowed to be less than zero. - sequence_length_less_one = math_ops.maximum(0, sequence_length - 1) + sequence_length_less_one = math_ops.maximum( + constant_op.constant(0, dtype=sequence_length.dtype), + sequence_length - 1) backpointers, last_score = rnn.dynamic_rnn( # [B, T - 1, O], [B, O] crf_fwd_cell, inputs=inputs, diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py index d58198faf353aab68430d2fa153a18de359112de..e26d56c8579e110d61c73c6154b82f47f0093687 100644 --- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py +++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py @@ -56,7 +56,7 @@ class _CudnnRNN(base_layer.Layer): Cudnn RNNs have two major differences from other platform-independent RNNs tf provides: * Cudnn LSTM and GRU are mathematically different from their tf counterparts. - (e.g. @{tf.contrib.rnn.LSTMBlockCell} and @{tf.nn.rnn_cell.GRUCell}. + (e.g. `tf.contrib.rnn.LSTMBlockCell` and `tf.nn.rnn_cell.GRUCell`. * Cudnn-trained checkpoints are not directly compatible with tf RNNs: * They use a single opaque parameter buffer for the entire (possibly) multi-layer multi-directional RNN; Whereas tf RNN weights are per-cell and @@ -182,7 +182,7 @@ class _CudnnRNN(base_layer.Layer): dropout: dropout rate, a number between [0, 1]. Dropout is applied between each layer (no dropout is applied for a model with a single layer). When set to 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. dtype: tf.float16, tf.float32 or tf.float64 kernel_initializer: starting value to initialize the weight. diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index 748d7cd011f32fdebd781176b560b9b7498f327e..2c92f31788378c2a9f01183bc04b035668b59b59 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -61,8 +61,8 @@ _WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell): """Cudnn Compatible LSTMCell. - A simple wrapper around @{tf.contrib.rnn.LSTMBlockCell} to use along with - @{tf.contrib.cudnn_rnn.CudnnLSTM}. The latter's params can be used by + A simple wrapper around `tf.contrib.rnn.LSTMBlockCell` to use along with + `tf.contrib.cudnn_rnn.CudnnLSTM`. The latter's params can be used by this cell seamlessly. """ @@ -76,8 +76,8 @@ class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell): class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell): """Cudnn Compatible GRUCell. - A GRU impl akin to @{tf.nn.rnn_cell.GRUCell} to use along with - @{tf.contrib.cudnn_rnn.CudnnGRU}. The latter's params can be used by + A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with + `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by it seamlessly. It differs from platform-independent GRUs in how the new memory gate is @@ -97,7 +97,7 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell): $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$ ``` - Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}): + Other GRU (see `tf.nn.rnn_cell.GRUCell` and `tf.contrib.rnn.GRUBlockCell`): ```python # new memory gate \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\) @@ -891,7 +891,7 @@ def _cudnn_rnn(inputs, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -957,7 +957,7 @@ def cudnn_lstm(inputs, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -998,7 +998,7 @@ def _cudnn_rnn_no_input_c(inputs, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1040,7 +1040,7 @@ def cudnn_gru(inputs, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1079,7 +1079,7 @@ def cudnn_rnn_relu(inputs, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1119,7 +1119,7 @@ def cudnn_rnn_tanh(inputs, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1161,7 +1161,7 @@ def cudnn_rnn_opaque_params_to_canonical(rnn_mode, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1224,7 +1224,7 @@ def cudnn_rnn_canonical_to_opaque_params(rnn_mode, direction: the direction model that the model operates. Could be either 'unidirectional' or 'bidirectional' dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1282,7 +1282,7 @@ def cudnn_rnn_opaque_params_size(rnn_mode, 'unidirectional' or 'bidirectional' dtype: one of tf.float32 or tf.float64. dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. name: name of the operation. Returns: @@ -1349,7 +1349,7 @@ class _CudnnRNN(object): 'unidirectional' or 'bidirectional' dtype: dtype of params, tf.float32 or tf.float64. dropout: whether to enable dropout. With it is 0, dropout is disabled. - seed: the op seed used for initializing dropout. See @{tf.set_random_seed} + seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. Raises: ValueError: if direction is invalid. diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py index 7878e46e88b2ea8b0012768342c218baeda80eaa..dbfff9b4f86065de9736eed72de173bc1bef35d6 100644 --- a/tensorflow/contrib/data/__init__.py +++ b/tensorflow/contrib/data/__init__.py @@ -15,7 +15,7 @@ """Experimental API for building input pipelines. This module contains experimental `Dataset` sources and transformations that can -be used in conjunction with the @{tf.data.Dataset} API. Note that the +be used in conjunction with the `tf.data.Dataset` API. Note that the `tf.contrib.data` API is not subject to the same backwards compatibility guarantees as `tf.data`, but we will provide deprecation advice in advance of removing existing functionality. diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD index 566cbb246a104d1e6cfc284d220ca8386b8897e1..2e249f5c14ab111ae412ff3288acc25de8d7aa11 100644 --- a/tensorflow/contrib/data/kernels/BUILD +++ b/tensorflow/contrib/data/kernels/BUILD @@ -37,6 +37,7 @@ cc_library( "//third_party/eigen3", "@protobuf_archive//:protobuf_headers", ], + alwayslink = 1, ) cc_library( @@ -58,6 +59,7 @@ cc_library( "//third_party/eigen3", "@protobuf_archive//:protobuf_headers", ], + alwayslink = 1, ) cc_library( @@ -68,6 +70,7 @@ cc_library( "//third_party/eigen3", "@protobuf_archive//:protobuf_headers", ], + alwayslink = 1, ) cc_library( @@ -78,6 +81,7 @@ cc_library( "//third_party/eigen3", "@protobuf_archive//:protobuf_headers", ], + alwayslink = 1, ) cc_library( diff --git a/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc b/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc index 95b8e1f7fd487119d77a5f708de42b014c55f79d..e36c9c0634235022362b59a6699b4d550d6d0eee 100644 --- a/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc @@ -42,13 +42,13 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, const DatasetBase* input, const std::vector& transformations, const DataTypeVector& output_types, const std::vector& output_shapes) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), input_(input), transformations_(transformations), output_types_(output_types), @@ -76,10 +76,11 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel { } protected: - Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* input_graph_node = nullptr; - TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node)); + TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); Node* transformations_node = nullptr; TF_RETURN_IF_ERROR(b->AddVector(transformations_, &transformations_node)); TF_RETURN_IF_ERROR(b->AddDataset( @@ -121,13 +122,13 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel { protected: Status SaveInternal(IteratorStateWriter* writer) override { - TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_)); + TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); return Status::OK(); } Status RestoreInternal(IteratorContext* ctx, IteratorStateReader* reader) override { - TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_)); + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); return Status::OK(); } diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc index f7e3ed886c6655cdc07e08bbe2fbe82e671a6802..d242cfdf4911ee43051b8aa2f7b960916b40374a 100644 --- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc @@ -131,7 +131,7 @@ class CSVDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, std::vector filenames, bool header, string compression_type, io::ZlibCompressionOptions options, @@ -139,7 +139,7 @@ class CSVDatasetOp : public DatasetOpKernel { const std::vector& output_shapes, std::vector record_defaults, std::vector select_cols, bool use_quote_delim, char delim, string na_value) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), filenames_(std::move(filenames)), header_(header), out_type_(output_types), @@ -168,7 +168,8 @@ class CSVDatasetOp : public DatasetOpKernel { string DebugString() const override { return "CSVDatasetOp::Dataset"; } protected: - Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* filenames = nullptr; Node* compression_type = nullptr; diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc index 6a12ca06f4d6cc2096aaf8191a01a899881b43db..ccf7ec1f842f5a1ad9b304c904f046ad49ed1757 100644 --- a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc @@ -63,11 +63,11 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, const DatasetBase* selector_input, std::vector data_inputs) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), selector_input_(selector_input), data_inputs_(std::move(data_inputs)) { selector_input_->Ref(); @@ -110,15 +110,16 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { } protected: - Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* selector_input_node; TF_RETURN_IF_ERROR( - b->AddParentDataset(ctx, selector_input_, &selector_input_node)); + b->AddInputDataset(ctx, selector_input_, &selector_input_node)); std::vector data_input_nodes(data_inputs_.size()); for (size_t i = 0; i < data_inputs_.size(); ++i) { TF_RETURN_IF_ERROR( - b->AddParentDataset(ctx, data_inputs_[i], &data_input_nodes[i])); + b->AddInputDataset(ctx, data_inputs_[i], &data_input_nodes[i])); } TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, selector_input_node}}, {{1, data_input_nodes}}, {}, output)); @@ -204,7 +205,7 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { Status SaveInternal(IteratorStateWriter* writer) override { mutex_lock l(mu_); if (selector_input_impl_) { - TF_RETURN_IF_ERROR(SaveParent(writer, selector_input_impl_)); + TF_RETURN_IF_ERROR(SaveInput(writer, selector_input_impl_)); } else { TF_RETURN_IF_ERROR( writer->WriteScalar(full_name("selector_input_impl_empty"), "")); @@ -212,7 +213,7 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { for (size_t i = 0; i < data_input_impls_.size(); ++i) { const auto& data_input_impl = data_input_impls_[i]; if (data_input_impl) { - TF_RETURN_IF_ERROR(SaveParent(writer, data_input_impl)); + TF_RETURN_IF_ERROR(SaveInput(writer, data_input_impl)); } else { TF_RETURN_IF_ERROR(writer->WriteScalar( full_name(strings::StrCat("data_input_impl_empty[", i, "]")), @@ -226,15 +227,14 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { IteratorStateReader* reader) override { mutex_lock l(mu_); if (!reader->Contains(full_name("selector_input_impl_empty"))) { - TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, selector_input_impl_)); + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, selector_input_impl_)); } else { selector_input_impl_.reset(); } for (size_t i = 0; i < data_input_impls_.size(); ++i) { if (!reader->Contains(full_name( strings::StrCat("data_input_impl_empty[", i, "]")))) { - TF_RETURN_IF_ERROR( - RestoreParent(ctx, reader, data_input_impls_[i])); + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, data_input_impls_[i])); } else { data_input_impls_[i].reset(); } diff --git a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc index bbec50681c6f5decec5a3b5fbf09cc3011a21199..db24e608463224f05159b57eb721718afd7cbb20 100644 --- a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc @@ -35,10 +35,10 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: explicit Dataset(OpKernelContext* ctx, const DatasetBase* input) - : GraphDatasetBase(ctx), input_(input) { + : DatasetBase(DatasetContext(ctx)), input_(input) { input_->Ref(); } @@ -62,10 +62,11 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { } protected: - Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* input_graph_node = nullptr; - TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node)); + TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output)); return Status::OK(); } @@ -106,7 +107,7 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { Status SaveInternal(IteratorStateWriter* writer) override { mutex_lock l(mu_); if (input_impl_) - TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_)); + TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); else TF_RETURN_IF_ERROR( writer->WriteScalar(full_name("input_impls_empty"), "")); @@ -119,7 +120,7 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { if (reader->Contains(full_name("input_impls_empty"))) input_impl_.reset(); else - TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_)); + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); return Status::OK(); } diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc index 32f03ca68364e40c6fd6769f05d0566f50119240..74df1e42a8fbca9b6a65aa4800424d27aa90de24 100644 --- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc +++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc @@ -526,6 +526,15 @@ string SanitizeThreadSuffix(string suffix) { return clean; } +struct HostBufferElement { + Status status; + bool end_of_sequence; + std::vector value; +}; + +using MultiDeviceIteratorCallback = + std::function; + class MultiDeviceIterator : public ResourceBase { public: MultiDeviceIterator(const DataTypeVector& output_types, @@ -539,83 +548,45 @@ class MultiDeviceIterator : public ResourceBase { devices_(devices), flib_def_(std::move(flib_def)), pflr_(std::move(pflr)), - lib_(lib) { - buffer_.resize(devices_.size()); - } + lib_(lib) {} string DebugString() override { - return strings::StrCat("MultiDeviceIterator"); + return strings::StrCat("MultiDeviceIterator for ", devices_.size(), + " devices"); } - Status Init(std::unique_ptr iterator, int64* incarnation_id) { - mutex_lock l(mu_); + Status Init(std::unique_ptr iterator, int64 max_buffer_size, + int64* incarnation_id) { if (iterator) { TF_RETURN_IF_ERROR( VerifyTypesMatch(output_types_, iterator->output_dtypes())); TF_RETURN_IF_ERROR( VerifyShapesCompatible(output_shapes_, iterator->output_shapes())); } - host_iterator_.reset(iterator.release()); - incarnation_id_++; + + mutex_lock l(mu_); + if (multi_device_buffer_) { + multi_device_buffer_->Reset(); + } + + ++incarnation_id_; *incarnation_id = incarnation_id_; - max_buffer_size_ = 0; - num_elements_ = 0; - buffer_.clear(); - buffer_.resize(devices_.size()); + + multi_device_buffer_.reset( + new MultiDeviceBuffer(devices_.size(), max_buffer_size, incarnation_id_, + std::move(iterator))); return Status::OK(); } - Status GetNextFromShard(IteratorContext* ctx, int shard_num, - int64 incarnation_id, - std::vector* out_tensors, - bool* end_of_sequence) { - // TODO(rohanj): This might potentially strand elements in other shards. - // Opportunity to do smarter locking semantics. - mutex_lock l(mu_); - // Make sure we're in the right incarnation. - if (incarnation_id != incarnation_id_) { - return errors::InvalidArgument( - "Current incarnation: ", incarnation_id_, - "; Supplied incarnation: ", incarnation_id); - } - // Then look it up in the buffer. - if (!buffer_[shard_num].empty()) { - const HostBufferElement& elem = buffer_[shard_num].front(); - *out_tensors = elem.value; - *end_of_sequence = elem.end_of_sequence; - Status s = elem.status; - buffer_[shard_num].pop_front(); - return s; - } - std::shared_ptr captured_iterator(host_iterator_); - if (captured_iterator) { - if (lib_ != nullptr) { - ctx->set_lib(lib_); - } - while (true) { - HostBufferElement elem; - elem.status = - captured_iterator->GetNext(ctx, &elem.value, &elem.end_of_sequence); - int buffer_index = num_elements_ % devices_.size(); - num_elements_++; - if (buffer_index == shard_num) { - out_tensors->swap(elem.value); - *end_of_sequence = elem.end_of_sequence; - return elem.status; - } else { - buffer_[buffer_index].push_back(std::move(elem)); - // TODO(rohanj): Put an upper bound to buffer size. - if (buffer_[buffer_index].size() > max_buffer_size_) { - max_buffer_size_ = buffer_[buffer_index].size(); - VLOG(1) << "MultiDeviceIterator: Max buffer size increased to: " - << max_buffer_size_; - } - } - } - } else { - return errors::FailedPrecondition("Iterator not initialized"); + void GetNextFromShard(IteratorContext* ctx, int shard_num, + int64 incarnation_id, + MultiDeviceIteratorCallback callback) { + if (lib_ != nullptr) { + ctx->set_lib(lib_); } - return Status::OK(); + tf_shared_lock l(mu_); + multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id, + std::move(callback)); } const DataTypeVector& output_types() const { return output_types_; } @@ -630,25 +601,218 @@ class MultiDeviceIterator : public ResourceBase { } private: - struct HostBufferElement { - Status status; - bool end_of_sequence; - std::vector value; + // A private class that uses a background thread to keep a per device buffer + // full. + class MultiDeviceBuffer { + public: + MultiDeviceBuffer(size_t size, int64 max_buffer_size, int64 incarnation_id, + std::unique_ptr host_iterator) + : buffer_(size), + size_(size), + max_buffer_size_(max_buffer_size), + incarnation_id_(incarnation_id), + host_iterator_(std::move(host_iterator)) {} + + ~MultiDeviceBuffer() { Reset(); } + + void Reset() LOCKS_EXCLUDED(mu_) { + { + mutex_lock l(mu_); + if (background_thread_finished_) { + return; + } + + cancelled_ = true; + // Wake up the background thread. + for (int i = 0; i < size_; ++i) { + buffer_[i].cond_var.notify_all(); + } + + // Make sure background thread has finished first. + while (!background_thread_finished_) { + shutdown_cond_var_.wait(l); + } + } + RunPendingCallbacks(); + } + + void GetNextFromShard(IteratorContext* ctx, int shard_num, + int64 incarnation_id, + MultiDeviceIteratorCallback callback) { + HostBufferElement elem; + if (incarnation_id_ != incarnation_id) { + elem.status = errors::InvalidArgument("Invalid incarnation id"); + callback(elem); + return; + } + + bool produced_output = false; + { + mutex_lock l(mu_); + if (cancelled_) { + elem.status = errors::Cancelled("Cancelled Multidevice iterator"); + callback(elem); + return; + } + + EnsureBackgroundThreadStarted(ctx); + + if (!buffer_[shard_num].data.empty()) { + produced_output = true; + std::swap(elem, buffer_[shard_num].data.front()); + buffer_[shard_num].data.pop_front(); + // Wake up background thread if it is blocked on this element. + if (buffer_[shard_num].data.size() == max_buffer_size_ - 1) { + buffer_[shard_num].cond_var.notify_all(); + } + } else { + if (background_thread_finished_) { + produced_output = true; + elem.end_of_sequence = true; + } else { + buffer_[shard_num].callbacks.push_back(std::move(callback)); + callback = nullptr; + } + } + } + + if (produced_output) { + callback(elem); + } + } + + private: + void EnsureBackgroundThreadStarted(IteratorContext* ctx) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (!background_thread_) { + background_thread_.reset(ctx->env()->StartThread( + {}, "multi_device_iterator_background_thread", + std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread, + this, new IteratorContext(*ctx)))); + } + } + + void RunPendingCallbacks() LOCKS_EXCLUDED(mu_) { + // Run all remaining callbacks. + std::vector cancellation_callbacks; + std::vector cancellation_elements; + { + mutex_lock l(mu_); + + for (int i = 0; i < size_; ++i) { + while (!buffer_[i].callbacks.empty()) { + if (buffer_[i].data.empty()) { + HostBufferElement elem; + elem.status = + errors::Cancelled("Cancelled and buffer not filled."); + cancellation_elements.push_back(std::move(elem)); + } else { + cancellation_elements.push_back( + std::move(buffer_[i].data.front())); + buffer_[i].data.pop_front(); + } + cancellation_callbacks.push_back( + std::move(buffer_[i].callbacks.front())); + buffer_[i].callbacks.pop_front(); + } + } + } + for (int i = 0; i < cancellation_callbacks.size(); ++i) { + cancellation_callbacks[i](cancellation_elements[i]); + } + } + + void BackgroundThread(IteratorContext* ctx) { + std::unique_ptr cleanup(ctx); + int shard_to_fetch = 0; + while (true) { + HostBufferElement elem; + MultiDeviceIteratorCallback callback = nullptr; + bool end_of_iterator = false; + + { + mutex_lock l(mu_); + while (!cancelled_ && + buffer_[shard_to_fetch].data.size() >= max_buffer_size_) { + buffer_[shard_to_fetch].cond_var.wait(l); + } + + if (cancelled_) { + background_thread_finished_ = true; + shutdown_cond_var_.notify_all(); + return; + } + } + + elem.status = + host_iterator_->GetNext(ctx, &elem.value, &elem.end_of_sequence); + + if (elem.status.ok() && elem.end_of_sequence) { + end_of_iterator = true; + } + + { + mutex_lock l(mu_); + // Try to find a callback, else just push stuff into buffer. + if (!buffer_[shard_to_fetch].callbacks.empty()) { + callback = buffer_[shard_to_fetch].callbacks.front(); + buffer_[shard_to_fetch].callbacks.pop_front(); + } else { + buffer_[shard_to_fetch].data.push_back(std::move(elem)); + elem = HostBufferElement(); + } + } + + if (callback) { + (*ctx->runner())(std::bind(std::move(callback), std::move(elem))); + } + + // Finish off the thread if we reach the end of the iterator. Runs + // pending callbacks. + if (end_of_iterator) { + { + mutex_lock l(mu_); + background_thread_finished_ = true; + shutdown_cond_var_.notify_all(); + } + RunPendingCallbacks(); + return; + } + shard_to_fetch = (shard_to_fetch + 1) % size_; + } + } + + struct HostBuffer { + condition_variable cond_var; + std::deque data; + std::deque callbacks; + }; + + mutex mu_; + std::unique_ptr background_thread_ GUARDED_BY(mu_); + bool background_thread_finished_ GUARDED_BY(mu_) = false; + bool cancelled_ GUARDED_BY(mu_) = false; + condition_variable shutdown_cond_var_ GUARDED_BY(mu_); + + std::vector buffer_; + + const size_t size_; + const int64 max_buffer_size_; + const int64 incarnation_id_; + const std::unique_ptr host_iterator_; }; mutex mu_; const DataTypeVector output_types_; const std::vector output_shapes_; const std::vector devices_; - int64 num_elements_ GUARDED_BY(mu_) = 0; - int64 max_buffer_size_ GUARDED_BY(mu_) = 0; - int64 incarnation_id_ GUARDED_BY(mu_) = 0; - std::vector> buffer_ GUARDED_BY(mu_); - std::unique_ptr flib_def_; - std::unique_ptr pflr_; - FunctionLibraryRuntime* lib_ = nullptr; // not owned. - std::shared_ptr host_iterator_; + const std::unique_ptr flib_def_; + const std::unique_ptr pflr_; + FunctionLibraryRuntime* const lib_ = nullptr; // not owned. std::shared_ptr lib_def_ GUARDED_BY(mu_); + + int64 incarnation_id_ GUARDED_BY(mu_) = 0; + std::unique_ptr multi_device_buffer_ GUARDED_BY(mu_); }; // Just creates a MultiDeviceIterator and returns it. @@ -754,6 +918,10 @@ class MultiDeviceIteratorInitOp : public OpKernel { : OpKernel(ctx) {} void Compute(OpKernelContext* ctx) override { + const Tensor* tensor_max_buffer_size; + OP_REQUIRES_OK(ctx, ctx->input("max_buffer_size", &tensor_max_buffer_size)); + int64 max_buffer_size = tensor_max_buffer_size->scalar()(); + DatasetBase* dataset; OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset)); MultiDeviceIterator* resource; @@ -761,12 +929,12 @@ class MultiDeviceIteratorInitOp : public OpKernel { LookupResource(ctx, HandleFromInput(ctx, 1), &resource)); core::ScopedUnref unref(resource); - IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx); std::unique_ptr iterator; - OP_REQUIRES_OK(ctx, - dataset->MakeIterator(&iter_ctx, "Iterator", &iterator)); + OP_REQUIRES_OK(ctx, dataset->MakeIterator(IteratorContext(ctx), "Iterator", + &iterator)); int64 incarnation_id; - OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), &incarnation_id)); + OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), max_buffer_size, + &incarnation_id)); Tensor tensor_incarnation_id(DT_INT64, TensorShape({})); tensor_incarnation_id.scalar()() = incarnation_id; OP_REQUIRES_OK(ctx, @@ -804,9 +972,6 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel { ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done); thread_pool_->Schedule(std::bind( [ctx, iterator, shard_num, incarnation_id](DoneCallback done) { - std::vector components; - bool end_of_sequence = false; - IteratorContext::Params params; params.env = ctx->env(); params.runner = *(ctx->runner()); @@ -817,22 +982,26 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel { }; IteratorContext iter_ctx(std::move(params)); - Status s = - iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id, - &components, &end_of_sequence); - iterator->Unref(); + MultiDeviceIteratorCallback callback = std::bind( + [ctx](const HostBufferElement& elem, DoneCallback done) { + // iterator->Unref(); + Status s = elem.status; + if (!s.ok()) { + ctx->SetStatus(s); + } else if (elem.end_of_sequence) { + ctx->SetStatus(errors::OutOfRange("End of sequence")); + } else { + for (int i = 0; i < elem.value.size(); ++i) { + ctx->set_output(i, elem.value[i]); + } + } + done(); + }, + std::placeholders::_1, std::move(done)); - if (!s.ok()) { - ctx->SetStatus(s); - } else if (end_of_sequence) { - ctx->SetStatus(errors::OutOfRange("End of sequence")); - } else { - for (int i = 0; i < components.size(); ++i) { - // TODO(mrry): Check that the shapes match the shape attrs. - ctx->set_output(i, components[i]); - } - } - done(); + iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id, + callback); + iterator->Unref(); }, std::move(done))); } diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc index 141706f393b076d9f55898ca4bdbe7438f7c3625..ab584504a05369105d080df73750974af9fc70bb 100644 --- a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc @@ -130,11 +130,13 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, const DatasetBase* input, ThreadPoolResource* threadpool) - : GraphDatasetBase(ctx), input_(input), threadpool_(threadpool) { + : DatasetBase(DatasetContext(ctx)), + input_(input), + threadpool_(threadpool) { input_->Ref(); threadpool_->Ref(); } @@ -162,11 +164,11 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { } protected: - Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { - return errors::Unimplemented( - "Cannot currently serialize the thread pool for a " - "ThreadPoolDataset."); + return errors::Unimplemented("%s does not support serialization", + DebugString()); } private: diff --git a/tensorflow/contrib/data/kernels/unique_dataset_op.cc b/tensorflow/contrib/data/kernels/unique_dataset_op.cc index 67c237799c10a2724f18bb0df99e4bf8f5cd2b8a..6fbf5d2ebb598132a7e8433608e67436a172b615 100644 --- a/tensorflow/contrib/data/kernels/unique_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/unique_dataset_op.cc @@ -47,10 +47,10 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, const DatasetBase* input) - : GraphDatasetBase(ctx), input_(input) { + : DatasetBase(DatasetContext(ctx)), input_(input) { input_->Ref(); } @@ -75,10 +75,11 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { } protected: - Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* input_graph_node = nullptr; - TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node)); + TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output)); return Status::OK(); } @@ -116,7 +117,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { Status SaveInternal(IteratorStateWriter* writer) override { mutex_lock l(mu_); if (input_impl_) { - TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_)); + TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); } else { TF_RETURN_IF_ERROR( writer->WriteScalar(full_name("input_impl_empty"), "")); @@ -135,7 +136,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { IteratorStateReader* reader) override { mutex_lock l(mu_); if (!reader->Contains(full_name("input_impl_empty"))) { - TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_)); + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); } else { input_impl_.reset(); } diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc index 66a7c7fdcd5e0ab77596177c209470e17f63bc10..cc5e250ea15bf89be2db9aba14e3b29b72512a73 100644 --- a/tensorflow/contrib/data/ops/dataset_ops.cc +++ b/tensorflow/contrib/data/ops/dataset_ops.cc @@ -168,9 +168,11 @@ output_shapes: The list of shapes being produced. REGISTER_OP("MultiDeviceIteratorInit") .Input("dataset: variant") .Input("multi_device_iterator: resource") + .Input("max_buffer_size: int64") .Output("incarnation_id: int64") .Doc(R"doc( Initializes the multi device iterator with the given dataset. +max_buffer_size: The maximum size of the host side per device buffer to keep. incarnation_id: An int64 indicating which incarnation of the MultiDeviceIterator is running. dataset: Dataset to be iterated upon. diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 2de1a79d28c16706e3c237d62935212ce387c776..2b75aa2ca54509b42f431db2dd39261cf025588a 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -175,7 +175,7 @@ py_test( "//tensorflow/python:variables", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/estimator", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -198,21 +198,46 @@ py_test( "//tensorflow/python:errors", "//tensorflow/python:framework_ops", "//tensorflow/python:io_ops", + "//tensorflow/python:math_ops", "//tensorflow/python:util", "//tensorflow/python/data/ops:dataset_ops", "//third_party/py/numpy", ], ) +py_test( + name = "map_defun_op_test", + size = "small", + srcs = ["map_defun_op_test.py"], + srcs_version = "PY2AND3", + tags = ["no_pip"], + deps = [ + "//tensorflow/contrib/data/python/ops:map_defun", + "//tensorflow/python:array_ops", + "//tensorflow/python:check_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:function", + "//tensorflow/python:math_ops", + ], +) + py_test( name = "optimize_dataset_op_test", size = "small", srcs = ["optimize_dataset_op_test.py"], srcs_version = "PY2AND3", deps = [ + ":stats_dataset_test_base", "//tensorflow/contrib/data/python/ops:optimization", + "//tensorflow/contrib/data/python/ops:stats_ops", "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", "//tensorflow/python:errors", + "//tensorflow/python:math_ops", "//tensorflow/python/data/ops:dataset_ops", "@absl_py//absl/testing:parameterized", ], @@ -239,7 +264,7 @@ cuda_py_test( tags = [ "manual", "no_oss", - "no_windows_gpu" + + "no_windows_gpu", "notap", ], ) @@ -431,8 +456,8 @@ py_test( tags = ["no_pip"], deps = [ ":reader_dataset_ops_test_base", + ":stats_dataset_test_base", "//tensorflow/contrib/data/python/ops:stats_ops", - "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:errors", @@ -442,6 +467,16 @@ py_test( ], ) +py_library( + name = "stats_dataset_test_base", + srcs = ["stats_dataset_test_base.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/core:protos_all_py", + "//tensorflow/python:client_testlib", + ], +) + py_test( name = "threadpool_dataset_ops_test", size = "small", diff --git a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py index 30a993b1f7056b9726f524b2279131339c80c5eb..77148aceec7fa90f927a9c009671c2939460877b 100644 --- a/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as saver_lib from tensorflow.python.training import training_util @@ -55,7 +56,7 @@ class CheckpointInputPipelineHookTest(test.TestCase): def _read_vars(self, model_dir): """Returns (global_step, latest_feature).""" with ops.Graph().as_default() as g: - ckpt_path = saver_lib.latest_checkpoint(model_dir) + ckpt_path = checkpoint_management.latest_checkpoint(model_dir) meta_filename = ckpt_path + '.meta' saver_lib.import_meta_graph(meta_filename) saver = saver_lib.Saver() diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py index 48adc98e9a4caee1651d5c7bca9dd813f11dfb01..009e21a34c8df86af6abbb7599dbcfa23ddf90a7 100644 --- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py @@ -80,6 +80,7 @@ class MapDatasetTest(test.TestCase): sess.run(get_next) def testReadFileIgnoreError(self): + def write_string_to_file(value, filename): with open(filename, "w") as f: f.write(value) @@ -308,5 +309,50 @@ class MapDatasetBenchmark(test.Benchmark): opt_mark, chain_length)) +class MapAndFilterBenchmark(test.Benchmark): + + # This benchmark compares the performance of pipeline with multiple chained + # map + filter with and without map fusion. + def benchmarkMapAndFilter(self): + chain_lengths = [0, 1, 2, 5, 10, 20, 50] + for chain_length in chain_lengths: + self._benchmarkMapAndFilter(chain_length, False) + self._benchmarkMapAndFilter(chain_length, True) + + def _benchmarkMapAndFilter(self, chain_length, optimize_dataset): + with ops.Graph().as_default(): + dataset = dataset_ops.Dataset.from_tensors(0).repeat(None) + for _ in range(chain_length): + dataset = dataset.map(lambda x: x + 5).filter( + lambda x: math_ops.greater_equal(x - 5, 0)) + if optimize_dataset: + dataset = dataset.apply( + optimization.optimize(["map_and_filter_fusion"])) + + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with session.Session() as sess: + for _ in range(10): + sess.run(next_element.op) + deltas = [] + for _ in range(100): + start = time.time() + for _ in range(100): + sess.run(next_element.op) + end = time.time() + deltas.append(end - start) + + median_wall_time = np.median(deltas) / 100 + opt_mark = "opt" if optimize_dataset else "no-opt" + print("Map and filter dataset {} chain length: {} Median wall time: {}". + format(opt_mark, chain_length, median_wall_time)) + self.report_benchmark( + iters=1000, + wall_time=median_wall_time, + name="benchmark_map_and_filter_dataset_chain_latency_{}_{}".format( + opt_mark, chain_length)) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a711325daed12f45e4e533f18ee81adc7dec93be --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/map_defun_op_test.py @@ -0,0 +1,126 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for MapDefunOp.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.data.python.ops import map_defun +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import function +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class MapDefunTest(test.TestCase): + + def testMapDefun_Simple(self): + + @function.Defun(dtypes.int32) + def simple_fn(x): + return x * 2 + 3 + + with self.test_session(): + nums = [[1, 2], [3, 4], [5, 6]] + elems = constant_op.constant(nums, dtype=dtypes.int32, name="data") + r = map_defun.map_defun(simple_fn, [elems], [dtypes.int32], [(2,)])[0] + expected = elems * 2 + 3 + self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + + def testMapDefun_MismatchedTypes(self): + + @function.Defun(dtypes.int32) + def fn(x): + return math_ops.cast(x, dtypes.float64) + + with self.test_session(): + nums = [1, 2, 3, 4, 5, 6] + elems = constant_op.constant(nums, dtype=dtypes.int32, name="data") + r = map_defun.map_defun(fn, [elems], [dtypes.int32], [()])[0] + with self.assertRaises(errors.InvalidArgumentError): + self.evaluate(r) + + def testMapDefun_MultipleOutputs(self): + + @function.Defun(dtypes.int32) + def fn(x): + return (x, math_ops.cast(x * 2 + 3, dtypes.float64)) + + with self.test_session(): + nums = [[1, 2], [3, 4], [5, 6]] + elems = constant_op.constant(nums, dtype=dtypes.int32, name="data") + r = map_defun.map_defun(fn, [elems], [dtypes.int32, dtypes.float64], + [(2,), (2,)]) + expected = [elems, elems * 2 + 3] + self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + + def testMapDefun_ShapeInference(self): + + @function.Defun(dtypes.int32) + def fn(x): + return x + + nums = [[1, 2], [3, 4], [5, 6]] + elems = constant_op.constant(nums, dtype=dtypes.int32, name="data") + result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])[0] + self.assertEqual(result.get_shape(), (3, 2)) + + def testMapDefun_PartialShapeInference(self): + + @function.Defun(dtypes.int32) + def fn(x): + return x + + elems = array_ops.placeholder(dtypes.int64, (None, 2)) + result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)]) + self.assertEqual(result[0].get_shape().as_list(), [None, 2]) + + def testMapDefun_RaisesErrorOnRuntimeShapeMismatch(self): + + @function.Defun(dtypes.int32, dtypes.int32) + def fn(x, y): + return x, y + + elems1 = array_ops.placeholder(dtypes.int32) + elems2 = array_ops.placeholder(dtypes.int32) + result = map_defun.map_defun(fn, [elems1, elems2], + [dtypes.int32, dtypes.int32], [(), ()]) + with self.test_session() as sess: + with self.assertRaisesWithPredicateMatch( + errors.InvalidArgumentError, + "All inputs must have the same dimension 0."): + sess.run(result, feed_dict={elems1: [1, 2, 3, 4, 5], elems2: [1, 2, 3]}) + + def testMapDefun_RaisesDefunError(self): + + @function.Defun(dtypes.int32) + def fn(x): + with ops.control_dependencies([check_ops.assert_equal(x, 0)]): + return array_ops.identity(x) + + elems = constant_op.constant([0, 0, 0, 37, 0]) + result = map_defun.map_defun(fn, [elems], [dtypes.int32], [()]) + with self.test_session(): + with self.assertRaises(errors.InvalidArgumentError): + self.evaluate(result) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py index d8156dc9c7bf187d7399aede44c41c8c50670248..ae147b4fa79c5fc8e63e1860f45036709ecc9777 100644 --- a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py @@ -19,9 +19,14 @@ from __future__ import print_function from absl.testing import parameterized +from tensorflow.contrib.data.python.kernel_tests import stats_dataset_test_base from tensorflow.contrib.data.python.ops import optimization +from tensorflow.contrib.data.python.ops import stats_ops from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -46,8 +51,7 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp( errors.InvalidArgumentError, "Asserted Whoops transformation at offset 0 but encountered " - "Map transformation instead." - ): + "Map transformation instead."): sess.run(get_next) def testAssertSuffixShort(self): @@ -123,19 +127,30 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase): functions = [identity, increment, increment_and_square] tests = [] - - for fun1 in functions: - for fun2 in functions: - tests.append(([fun1, fun2],)) - for fun3 in functions: - tests.append(([fun1, fun2, fun3],)) + for i, fun1 in enumerate(functions): + for j, fun2 in enumerate(functions): + tests.append(( + "test_{}_{}".format(i, j), + [fun1, fun2], + )) + for k, fun3 in enumerate(functions): + tests.append(( + "test_{}_{}_{}".format(i, j, k), + [fun1, fun2, fun3], + )) swap = lambda x, n: (n, x) - tests.append(([lambda x: (x, 42), swap],)) - tests.append(([lambda x: (x, 42), swap, swap],)) + tests.append(( + "swap1", + [lambda x: (x, 42), swap], + )) + tests.append(( + "swap2", + [lambda x: (x, 42), swap, swap], + )) return tuple(tests) - @parameterized.parameters(*map_functions.__func__()) + @parameterized.named_parameters(*map_functions.__func__()) def testMapFusion(self, functions): dataset = dataset_ops.Dataset.range(5).apply( optimization.assert_next(["Map", "Prefetch"])) @@ -159,6 +174,108 @@ class OptimizeDatasetTest(test.TestCase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(get_next) + @staticmethod + def map_and_filter_functions(): + identity = lambda x: x + increment = lambda x: x + 1 + minus_five = lambda x: x - 5 + + def increment_and_square(x): + y = x + 1 + return y * y + + take_all = lambda x: constant_op.constant(True) + is_zero = lambda x: math_ops.equal(x, 0) + is_odd = lambda x: math_ops.equal(x % 2, 0) + greater = lambda x: math_ops.greater(x + 5, 0) + + functions = [identity, increment, minus_five, increment_and_square] + filters = [take_all, is_zero, is_odd, greater] + tests = [] + + for x, fun in enumerate(functions): + for y, predicate in enumerate(filters): + tests.append(("mixed_{}_{}".format(x, y), fun, predicate)) + + # Multi output + tests.append(("multiOne", lambda x: (x, x), + lambda x, y: constant_op.constant(True))) + tests.append( + ("multiTwo", lambda x: (x, 2), + lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0))) + return tuple(tests) + + @parameterized.named_parameters(*map_and_filter_functions.__func__()) + def testMapFilterFusion(self, function, predicate): + dataset = dataset_ops.Dataset.range(10).apply( + optimization.assert_next( + ["Map", + "FilterByLastComponent"])).map(function).filter(predicate).apply( + optimization.optimize(["map_and_filter_fusion"])) + self._testMapAndFilter(dataset, function, predicate) + + def _testMapAndFilter(self, dataset, function, predicate): + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + with self.test_session() as sess: + for x in range(10): + r = function(x) + if isinstance(r, tuple): + b = predicate(*r) # Pass tuple as multiple arguments. + else: + b = predicate(r) + if sess.run(b): + result = sess.run(get_next) + self.assertAllEqual(r, result) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testAdditionalInputs(self): + a = constant_op.constant(3, dtype=dtypes.int64) + b = constant_op.constant(4, dtype=dtypes.int64) + some_tensor = math_ops.mul(a, b) + function = lambda x: x * x + + def predicate(y): + return math_ops.less(math_ops.cast(y, dtypes.int64), some_tensor) + + # We are currently not supporting functions with additional inputs. + dataset = dataset_ops.Dataset.range(10).apply( + optimization.assert_next( + ["Map", "Filter"])).map(function).filter(predicate).apply( + optimization.optimize(["map_and_filter_fusion"])) + + self._testMapAndFilter(dataset, function, predicate) + + +class OptimizeStatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): + + def testLatencyStatsOptimization(self): + + stats_aggregator = stats_ops.StatsAggregator() + dataset = dataset_ops.Dataset.from_tensors(1).apply( + optimization.assert_next( + ["LatencyStats", "Map", "LatencyStats", "Prefetch", + "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply( + optimization.optimize(["latency_all_edges"])).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) + iterator = dataset.make_initializable_iterator() + get_next = iterator.get_next() + summary_t = stats_aggregator.get_summary() + + with self.test_session() as sess: + sess.run(iterator.initializer) + self.assertEqual(1 * 1, sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + summary_str = sess.run(summary_t) + self._assertSummaryHasCount(summary_str, + "record_latency_TensorDataset/_1", 1) + self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", + 1) + self._assertSummaryHasCount(summary_str, + "record_latency_PrefetchDataset/_6", 1) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py index 2da6131e8e60ca53723da7f66a7ee52151640129..361fe0dd39bb3f855c3b0b11281a9909fd601232 100644 --- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py @@ -907,6 +907,42 @@ class CopyToDeviceTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + def testIteratorGetNextAsOptionalOnGPU(self): + if not test_util.is_gpu_available(): + self.skipTest("No GPU available") + + host_dataset = dataset_ops.Dataset.range(3) + device_dataset = host_dataset.apply( + prefetching_ops.copy_to_device("/gpu:0")) + with ops.device("/gpu:0"): + iterator = device_dataset.make_initializable_iterator() + next_elem = iterator_ops.get_next_as_optional(iterator) + elem_has_value_t = next_elem.has_value() + elem_value_t = next_elem.get_value() + + with self.test_session() as sess: + # Before initializing the iterator, evaluating the optional fails with + # a FailedPreconditionError. + with self.assertRaises(errors.FailedPreconditionError): + sess.run(elem_has_value_t) + with self.assertRaises(errors.FailedPreconditionError): + sess.run(elem_value_t) + + # For each element of the dataset, assert that the optional evaluates to + # the expected value. + sess.run(iterator.initializer) + for i in range(3): + elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t]) + self.assertTrue(elem_has_value) + self.assertEqual(i, elem_value) + + # After exhausting the iterator, `next_elem.has_value()` will evaluate to + # false, and attempting to get the value will fail. + for _ in range(2): + self.assertFalse(sess.run(elem_has_value_t)) + with self.assertRaises(errors.InvalidArgumentError): + sess.run(elem_value_t) + class MultiDeviceIteratorTest(test.TestCase): @@ -985,7 +1021,7 @@ class MultiDeviceIteratorTest(test.TestCase): def testUneven(self): dataset = dataset_ops.Dataset.range(10) multi_device_iterator = prefetching_ops.MultiDeviceIterator( - dataset, ["/cpu:1", "/cpu:2"]) + dataset, ["/cpu:1", "/cpu:2"], max_buffer_size=4) elem_on_1, elem_on_2 = multi_device_iterator.get_next() config = config_pb2.ConfigProto(device_count={"CPU": 3}) @@ -1043,7 +1079,7 @@ class MultiDeviceIteratorTest(test.TestCase): with compat.forward_compatibility_horizon(2018, 8, 4): dataset = dataset_ops.Dataset.range(10) multi_device_iterator = prefetching_ops.MultiDeviceIterator( - dataset, ["/cpu:1", "/gpu:0"]) + dataset, ["/cpu:1", "/gpu:0"], max_buffer_size=4) elem_on_1, elem_on_2 = multi_device_iterator.get_next() config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1}) diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py index 851a33dfc849a2d935887def44734aace5dcaf7f..15b342d30f85a05b3827998565ba5f84021ac885 100644 --- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py @@ -173,15 +173,23 @@ class ReadBatchFeaturesTest( for num_epochs in [1, 10]: with ops.Graph().as_default(): # Basic test: read from file 0. - self.outputs = self.make_batch_feature( + outputs = self.make_batch_feature( filenames=self.test_filenames[0], num_epochs=num_epochs, batch_size=batch_size, drop_final_batch=True).make_one_shot_iterator().get_next() - for _, tensor in self.outputs.items(): + for _, tensor in outputs.items(): if isinstance(tensor, ops.Tensor): # Guard against SparseTensor. self.assertEqual(tensor.shape[0], batch_size) + def testIndefiniteRepeatShapeInference(self): + dataset = self.make_batch_feature( + filenames=self.test_filenames[0], num_epochs=None, batch_size=32) + for shape, clazz in zip(nest.flatten(dataset.output_shapes), + nest.flatten(dataset.output_classes)): + if issubclass(clazz, ops.Tensor): + self.assertEqual(32, shape[0]) + class MakeCsvDatasetTest(test.TestCase): @@ -795,6 +803,16 @@ class MakeCsvDatasetTest(test.TestCase): all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) + def testIndefiniteRepeatShapeInference(self): + column_names = ["col%d" % i for i in range(5)] + inputs = [[",".join(x for x in column_names), "0,1,2,3,4", "5,6,7,8,9"], [ + ",".join(x for x in column_names), "10,11,12,13,14", "15,16,17,18,19" + ]] + filenames = self._setup_files(inputs) + dataset = self._make_csv_dataset(filenames, batch_size=32, num_epochs=None) + for shape in nest.flatten(dataset.output_shapes): + self.assertEqual(32, shape[0]) + class MakeTFRecordDatasetTest( reader_dataset_ops_test_base.TFRecordDatasetTestBase): @@ -1002,5 +1020,12 @@ class MakeTFRecordDatasetTest( self._shuffle_test(batch_size, num_epochs, num_parallel_reads, seed=21345) + def testIndefiniteRepeatShapeInference(self): + dataset = readers.make_tf_record_dataset( + file_pattern=self.test_filenames, num_epochs=None, batch_size=32) + for shape in nest.flatten(dataset.output_shapes): + self.assertEqual(32, shape[0]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD b/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD index 3c3f23f9a984c702abfdacf11bef0e5d4066782f..7b9ea191a4524891d1b589e1e228e29241fda7f8 100644 --- a/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/serialization/BUILD @@ -56,6 +56,7 @@ py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:errors", "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py b/tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py index a0a1100893c7384b0e2bd9fcfdaa8d3698b95d28..1b6059ccbcc81937696e1b0ebb269f213adbb976 100644 --- a/tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/serialization/cache_dataset_serialization_test.py @@ -19,6 +19,8 @@ from __future__ import print_function import os +from absl.testing import parameterized + from tensorflow.contrib.data.python.kernel_tests.serialization import dataset_serialization_test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors @@ -26,7 +28,8 @@ from tensorflow.python.platform import test class CacheDatasetSerializationTest( - dataset_serialization_test_base.DatasetSerializationTestBase): + dataset_serialization_test_base.DatasetSerializationTestBase, + parameterized.TestCase): def setUp(self): self.range_size = 10 @@ -34,88 +37,123 @@ class CacheDatasetSerializationTest( self.num_outputs = self.range_size * self.num_repeats self.cache_file_prefix = 'test' - def ds_fn(self): - return dataset_ops.Dataset.range(self.range_size).cache( - os.path.join(self.get_temp_dir(), - self.cache_file_prefix)).repeat(self.num_repeats) + def make_dataset_fn(self, is_memory): + if is_memory: + filename = '' + else: + filename = os.path.join(self.get_temp_dir(), self.cache_file_prefix) + + def ds_fn(): + return dataset_ops.Dataset.range(self.range_size).cache(filename).repeat( + self.num_repeats) + + return ds_fn def expected_outputs(self): return list(range(self.range_size)) * self.num_repeats - def testCheckpointBeforeOneEpoch(self): + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointBeforeOneEpoch(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + # Generate 5 entries from iterator and save checkpoint. - outputs = self.gen_outputs(self.ds_fn, [], 5, verify_exhausted=False) + outputs = self.gen_outputs(ds_fn, [], 5, verify_exhausted=False) self.assertSequenceEqual(outputs, range(5)) # Restore from checkpoint and produce the rest of the elements from the # iterator. outputs.extend( self.gen_outputs( - self.ds_fn, [], + ds_fn, [], self.num_outputs - 5, ckpt_saved=True, verify_exhausted=False)) self.assertSequenceEqual(outputs, self.expected_outputs()) - def testCheckpointBeforeOneEpochThenRunFewSteps(self): - # Generate 8 entries from iterator but save checkpoint after producing - # 5. + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointBeforeOneEpochThenRunFewSteps(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + + # Generate 8 entries from iterator but save checkpoint after producing 5. outputs = self.gen_outputs( - self.ds_fn, [5], - 8, - verify_exhausted=False, - save_checkpoint_at_end=False) + ds_fn, [5], 8, verify_exhausted=False, save_checkpoint_at_end=False) self.assertSequenceEqual(outputs, range(8)) - # Restoring from checkpoint and running GetNext should return a - # `AlreadExistsError` now because the lockfile already exists. - with self.assertRaises(errors.AlreadyExistsError): - self.gen_outputs( - self.ds_fn, [], - self.num_outputs - 5, - ckpt_saved=True, - verify_exhausted=False) + if is_memory: + outputs = outputs[:5] + outputs.extend( + self.gen_outputs( + ds_fn, [], + self.num_outputs - 5, + ckpt_saved=True, + verify_exhausted=False)) + self.assertSequenceEqual(outputs, self.expected_outputs()) + else: + # Restoring from checkpoint and running GetNext should return + # `AlreadExistsError` now because the lockfile already exists. + with self.assertRaises(errors.AlreadyExistsError): + self.gen_outputs( + ds_fn, [], + self.num_outputs - 5, + ckpt_saved=True, + verify_exhausted=False) + + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointAfterOneEpoch(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) - def testCheckpointAfterOneEpoch(self): # Generate 15 entries from iterator and save checkpoint. - outputs = self.gen_outputs(self.ds_fn, [], 15, verify_exhausted=False) + outputs = self.gen_outputs(ds_fn, [], 15, verify_exhausted=False) self.assertSequenceEqual(outputs, list(range(10)) + list(range(5))) # Restore from checkpoint and produce the rest of the elements from the # iterator. outputs.extend( self.gen_outputs( - self.ds_fn, [], + ds_fn, [], self.num_outputs - 15, ckpt_saved=True, verify_exhausted=False)) self.assertSequenceEqual(outputs, self.expected_outputs()) - def testCheckpointAfterOneEpochThenRunFewSteps(self): - # Generate 18 entries from iterator but save checkpoint after producing - # 15. + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointAfterOneEpochThenRunFewSteps(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + + # Generate 18 entries from iterator but save checkpoint after producing 15. outputs = self.gen_outputs( - self.ds_fn, [15], - 18, - verify_exhausted=False, - save_checkpoint_at_end=False) + ds_fn, [15], 18, verify_exhausted=False, save_checkpoint_at_end=False) self.assertSequenceEqual(outputs, list(range(10)) + list(range(8))) outputs = list(range(10)) + list(range(5)) + self.gen_outputs( - self.ds_fn, [], + ds_fn, [], self.num_outputs - 15, ckpt_saved=True, verify_exhausted=False) self.assertSequenceEqual(outputs, list(range(10)) * 3) - def testCheckpointBeforeOneEpochButRunCompleteEpoch(self): - # Generate 13 entries from iterator but save checkpoint after producing - # 5. + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointBeforeOneEpochButRunCompleteEpoch(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + + # Generate 13 entries from iterator but save checkpoint after producing 5. outputs = self.gen_outputs( - self.ds_fn, [5], - 13, - verify_exhausted=False, - save_checkpoint_at_end=False) + ds_fn, [5], 13, verify_exhausted=False, save_checkpoint_at_end=False) self.assertSequenceEqual(outputs, list(range(10)) + list(range(3))) # Since we ran for more than one epoch, the cache was completely written. @@ -124,65 +162,90 @@ class CacheDatasetSerializationTest( # been completely written. outputs = list(range(5)) + self.gen_outputs( - self.ds_fn, [], + ds_fn, [], self.num_outputs - 5, ckpt_saved=True, verify_exhausted=False) self.assertSequenceEqual(outputs, list(range(10)) * 3) - def testCheckpointUnusedWriterIterator(self): + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointUnusedWriterIterator(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + # Checkpoint before get_next is called even once. - outputs = self.gen_outputs(self.ds_fn, [], 0, verify_exhausted=False) + outputs = self.gen_outputs(ds_fn, [], 0, verify_exhausted=False) self.assertSequenceEqual(outputs, []) outputs = self.gen_outputs( - self.ds_fn, [], - self.num_outputs, - ckpt_saved=True, - verify_exhausted=False) + ds_fn, [], self.num_outputs, ckpt_saved=True, verify_exhausted=False) self.assertSequenceEqual(outputs, list(range(10)) * 3) - def testCheckpointUnusedMidwayWriterIterator(self): + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testCheckpointUnusedMidwayWriterIterator(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + # Produce 5 elements and checkpoint. - outputs = self.gen_outputs(self.ds_fn, [], 5, verify_exhausted=False) + outputs = self.gen_outputs(ds_fn, [], 5, verify_exhausted=False) self.assertSequenceEqual(outputs, range(5)) # Restore from checkpoint, then produce no elements and checkpoint. outputs.extend( - self.gen_outputs( - self.ds_fn, [], 0, ckpt_saved=True, verify_exhausted=False)) + self.gen_outputs(ds_fn, [], 0, ckpt_saved=True, verify_exhausted=False)) self.assertSequenceEqual(outputs, range(5)) # Restore from checkpoint and produce rest of the elements. outputs.extend( self.gen_outputs( - self.ds_fn, [], + ds_fn, [], self.num_outputs - 5, ckpt_saved=True, verify_exhausted=False)) self.assertSequenceEqual(outputs, list(range(10)) * 3) - def testUnusedCheckpointError(self): + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testUnusedCheckpointError(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) + # Produce 5 elements and save ckpt. - outputs = self.gen_outputs(self.ds_fn, [], 5, verify_exhausted=False) + outputs = self.gen_outputs(ds_fn, [], 5, verify_exhausted=False) self.assertSequenceEqual(outputs, range(5)) - # Since the complete cache has not been written, a new iterator which does - # not restore the checkpoint will throw an error since there is a partial - # cache shard. - with self.assertRaises(errors.AlreadyExistsError): + if is_memory: outputs = self.gen_outputs( - self.ds_fn, [], self.num_outputs, verify_exhausted=False) + ds_fn, [], self.num_outputs, verify_exhausted=False) + self.assertSequenceEqual(outputs, self.expected_outputs()) + else: + # Since the complete cache has not been written, a new iterator which does + # not restore the checkpoint will throw an error since there is a partial + # cache shard. + with self.assertRaises(errors.AlreadyExistsError): + outputs = self.gen_outputs( + ds_fn, [], self.num_outputs, verify_exhausted=False) + + @parameterized.named_parameters( + ('Memory', True), + ('File', False), + ) + def testIgnoreCheckpointIfCacheWritten(self, is_memory): + ds_fn = self.make_dataset_fn(is_memory) - def testIgnoreCheckpointIfCacheWritten(self): # Produce 15 elements and save ckpt. This will write the complete cache. - outputs = self.gen_outputs(self.ds_fn, [], 15, verify_exhausted=False) + outputs = self.gen_outputs(ds_fn, [], 15, verify_exhausted=False) self.assertSequenceEqual(outputs, list(range(10)) + list(range(5))) # Build the iterator again but do not restore from ckpt. Since the cache # has already been written we should be able to use it. outputs = self.gen_outputs( - self.ds_fn, [], self.num_outputs, verify_exhausted=False) + ds_fn, [], self.num_outputs, verify_exhausted=False) self.assertSequenceEqual(outputs, list(range(10)) * 3) diff --git a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py index 393f08850b1865180a8b94e9209b2445b54c8b69..3ed4dfb7295ca77c78ce5318bf31e16a354e16a8 100644 --- a/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py +++ b/tensorflow/contrib/data/python/kernel_tests/serialization/dataset_serialization_test_base.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import test +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as saver_lib from tensorflow.python.util import nest @@ -655,7 +656,7 @@ class DatasetSerializationTestBase(test.TestCase): return os.path.join(self.get_temp_dir(), "iterator") def _latest_ckpt(self): - return saver_lib.latest_checkpoint(self.get_temp_dir()) + return checkpoint_management.latest_checkpoint(self.get_temp_dir()) def _save(self, sess, saver): saver.save(sess, self._ckpt_path()) diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py index b4945685c1d1062bf416b73f1541f351adf45604..a41d21f8c14ed6bec7626599a5aa7f365765ce8b 100644 --- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py @@ -20,8 +20,8 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base +from tensorflow.contrib.data.python.kernel_tests import stats_dataset_test_base from tensorflow.contrib.data.python.ops import stats_ops -from tensorflow.core.framework import summary_pb2 from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -29,28 +29,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class StatsDatasetTestBase(test.TestCase): - - def _assertSummaryHasCount(self, summary_str, tag, expected_value): - summary_proto = summary_pb2.Summary() - summary_proto.ParseFromString(summary_str) - for value in summary_proto.value: - if tag == value.tag: - self.assertEqual(expected_value, value.histo.num) - return - self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto)) - - def _assertSummaryHasSum(self, summary_str, tag, expected_value): - summary_proto = summary_pb2.Summary() - summary_proto.ParseFromString(summary_str) - for value in summary_proto.value: - if tag == value.tag: - self.assertEqual(expected_value, value.histo.sum) - return - self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto)) - - -class StatsDatasetTest(StatsDatasetTestBase): +class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): def testBytesProduced(self): stats_aggregator = stats_ops.StatsAggregator() @@ -197,7 +176,7 @@ class StatsDatasetTest(StatsDatasetTestBase): class FeatureStatsDatasetTest( - StatsDatasetTestBase, + stats_dataset_test_base.StatsDatasetTestBase, reader_dataset_ops_test_base.ReadBatchFeaturesTestBase): def testFeaturesStats(self): diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py new file mode 100644 index 0000000000000000000000000000000000000000..9a13acf8f0ac6690cad8847873768562da795496 --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_test_base.py @@ -0,0 +1,44 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base class for testing the input pipeline statistics gathering ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from tensorflow.core.framework import summary_pb2 +from tensorflow.python.platform import test + + +class StatsDatasetTestBase(test.TestCase): + """Base class for testing statistics gathered in `StatsAggregator`.""" + + def _assertSummaryHasCount(self, summary_str, tag, expected_value): + summary_proto = summary_pb2.Summary() + summary_proto.ParseFromString(summary_str) + for value in summary_proto.value: + if tag == value.tag: + self.assertEqual(expected_value, value.histo.num) + return + self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto)) + + def _assertSummaryHasSum(self, summary_str, tag, expected_value): + summary_proto = summary_pb2.Summary() + summary_proto.ParseFromString(summary_str) + for value in summary_proto.value: + if tag == value.tag: + self.assertEqual(expected_value, value.histo.sum) + return + self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto)) diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index 1ad021ea037add48afee5bdfda9eea18485eca5d..ad9378dfb9d938c826f994da9bbb89101cfbd872 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -210,6 +210,17 @@ py_library( ], ) +py_library( + name = "map_defun", + srcs = ["map_defun.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:dataset_ops_gen", + "//tensorflow/python:framework_ops", + "//tensorflow/python:tensor_shape", + ], +) + py_library( name = "resampling", srcs = ["resampling.py"], @@ -370,6 +381,7 @@ py_library( ":get_single_element", ":grouping", ":interleave_ops", + ":map_defun", ":optimization", ":prefetching_ops", ":readers", diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 42fc20ec015a078ef8cd42065196f45438f19785..9f059942a65177186132164531237f838ecd63a2 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -31,7 +31,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops @@ -186,7 +185,7 @@ def dense_to_sparse_batch(batch_size, row_shape): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -402,7 +401,7 @@ def unbatch(): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -439,54 +438,12 @@ def unbatch(): return _apply_fn -def _filter_irregular_batches(batch_size): - """Transformation that filters out batches that are not of size batch_size.""" - - def _apply_fn(dataset): - """Function from `Dataset` to `Dataset` that applies the transformation.""" - tensor_batch_size = ops.convert_to_tensor( - batch_size, dtype=dtypes.int64, name="batch_size") - - flattened = _RestructuredDataset( - dataset, - tuple(nest.flatten(dataset.output_types)), - output_classes=tuple(nest.flatten(dataset.output_classes))) - - def _predicate(*xs): - """Return `True` if this element is a full batch.""" - # Extract the dynamic batch size from the first component of the flattened - # batched element. - first_component = xs[0] - first_component_batch_size = array_ops.shape( - first_component, out_type=dtypes.int64)[0] - - return math_ops.equal(first_component_batch_size, tensor_batch_size) - - filtered = flattened.filter(_predicate) - - maybe_constant_batch_size = tensor_util.constant_value(tensor_batch_size) - - def _set_first_dimension(shape): - return shape.merge_with( - tensor_shape.vector(maybe_constant_batch_size).concatenate(shape[1:])) - - known_shapes = nest.map_structure(_set_first_dimension, - dataset.output_shapes) - return _RestructuredDataset( - filtered, - dataset.output_types, - known_shapes, - output_classes=dataset.output_classes) - - return _apply_fn - - @deprecation.deprecated( None, "Use `tf.data.Dataset.batch(..., drop_remainder=True)`.") def batch_and_drop_remainder(batch_size): """A batching transformation that omits the final small batch (if present). - Like @{tf.data.Dataset.batch}, this transformation combines + Like `tf.data.Dataset.batch`, this transformation combines consecutive elements of this dataset into batches. However, if the batch size does not evenly divide the input dataset size, this transformation will drop the final smaller element. @@ -510,7 +467,7 @@ def batch_and_drop_remainder(batch_size): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply} + `tf.data.Dataset.apply` """ def _apply_fn(dataset): @@ -527,25 +484,25 @@ def padded_batch_and_drop_remainder(batch_size, padding_values=None): """A batching and padding transformation that omits the final small batch. - Like @{tf.data.Dataset.padded_batch}, this transformation combines + Like `tf.data.Dataset.padded_batch`, this transformation combines consecutive elements of this dataset into batches. However, if the batch size does not evenly divide the input dataset size, this transformation will drop the final smaller element. - See `@{tf.contrib.data.batch_and_drop_remainder}` for more details. + See `tf.contrib.data.batch_and_drop_remainder` for more details. Args: batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of consecutive elements of this dataset to combine in a single batch. padded_shapes: A nested structure of `tf.TensorShape` or `tf.int64` vector tensor-like objects. See - @{tf.data.Dataset.padded_batch} for details. + `tf.data.Dataset.padded_batch` for details. padding_values: (Optional.) A nested structure of scalar-shaped - `tf.Tensor`. See @{tf.data.Dataset.padded_batch} for details. + `tf.Tensor`. See `tf.data.Dataset.padded_batch` for details. Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply} + `tf.data.Dataset.apply` """ def _apply_fn(dataset): @@ -704,7 +661,7 @@ def assert_element_shape(expected_shapes): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply} + `tf.data.Dataset.apply` """ def _check_shape(*elements): @@ -803,7 +760,7 @@ def map_and_batch(map_func, Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. Raises: ValueError: If both `num_parallel_batches` and `num_parallel_calls` are diff --git a/tensorflow/contrib/data/python/ops/enumerate_ops.py b/tensorflow/contrib/data/python/ops/enumerate_ops.py index ac2b386b81532b801139baa00fd5edd4ecd6ef0a..490281e0d2da7a454a2f63f95753c7c436b87a76 100644 --- a/tensorflow/contrib/data/python/ops/enumerate_ops.py +++ b/tensorflow/contrib/data/python/ops/enumerate_ops.py @@ -47,7 +47,7 @@ def enumerate_dataset(start=0): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py index d46d96c461ad4cc0ac25a8ddc285cec23d09c682..b4a7521e0875089c39ac7aa8b7b49e44feb2b4ad 100644 --- a/tensorflow/contrib/data/python/ops/error_ops.py +++ b/tensorflow/contrib/data/python/ops/error_ops.py @@ -42,7 +42,7 @@ def ignore_errors(): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py index ef9284456eb35099db804e0680abfacd6384d503..a6713b017afa315edec9389d0a6c1c7135e6aeb9 100644 --- a/tensorflow/contrib/data/python/ops/get_single_element.py +++ b/tensorflow/contrib/data/python/ops/get_single_element.py @@ -29,8 +29,8 @@ from tensorflow.python.ops import gen_dataset_ops def get_single_element(dataset): """Returns the single element in `dataset` as a nested structure of tensors. - This function enables you to use a @{tf.data.Dataset} in a stateless - "tensor-in tensor-out" expression, without creating a @{tf.data.Iterator}. + This function enables you to use a `tf.data.Dataset` in a stateless + "tensor-in tensor-out" expression, without creating a `tf.data.Iterator`. This can be useful when your preprocessing transformations are expressed as a `Dataset`, and you want to use the transformation at serving time. For example: @@ -50,10 +50,10 @@ def get_single_element(dataset): ``` Args: - dataset: A @{tf.data.Dataset} object containing a single element. + dataset: A `tf.data.Dataset` object containing a single element. Returns: - A nested structure of @{tf.Tensor} objects, corresponding to the single + A nested structure of `tf.Tensor` objects, corresponding to the single element of `dataset`. Raises: @@ -77,11 +77,11 @@ def reduce_dataset(dataset, reducer): """Returns the result of reducing the `dataset` using `reducer`. Args: - dataset: A @{tf.data.Dataset} object. - reducer: A @{tf.contrib.data.Reducer} object representing the reduce logic. + dataset: A `tf.data.Dataset` object. + reducer: A `tf.contrib.data.Reducer` object representing the reduce logic. Returns: - A nested structure of @{tf.Tensor} objects, corresponding to the result + A nested structure of `tf.Tensor` objects, corresponding to the result of reducing `dataset` using `reducer`. Raises: diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py index bd8d398c58cc1825616c1ab5337cf6668c66697e..6edc1d79902c571b34b6a0a108c4d62cb6097ccb 100644 --- a/tensorflow/contrib/data/python/ops/grouping.py +++ b/tensorflow/contrib/data/python/ops/grouping.py @@ -50,7 +50,7 @@ def group_by_reducer(key_func, reducer): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -92,7 +92,7 @@ def group_by_window(key_func, Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. Raises: ValueError: if neither or both of {`window_size`, `window_size_func`} are @@ -142,11 +142,11 @@ def bucket_by_sequence_length(element_length_func, bucket_batch_sizes: `list`, batch size per bucket. Length should be `len(bucket_boundaries) + 1`. padded_shapes: Nested structure of `tf.TensorShape` to pass to - @{tf.data.Dataset.padded_batch}. If not provided, will use + `tf.data.Dataset.padded_batch`. If not provided, will use `dataset.output_shapes`, which will result in variable length dimensions being padded out to the maximum length in each batch. padding_values: Values to pad with, passed to - @{tf.data.Dataset.padded_batch}. Defaults to padding with 0. + `tf.data.Dataset.padded_batch`. Defaults to padding with 0. pad_to_bucket_boundary: bool, if `False`, will pad dimensions with unknown size to maximum length in batch. If `True`, will pad dimensions with unknown size to bucket boundary minus 1 (i.e., the maximum length in each @@ -155,7 +155,7 @@ def bucket_by_sequence_length(element_length_func, Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. Raises: ValueError: if `len(bucket_batch_sizes) != len(bucket_boundaries) + 1`. diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py index bcc959594a6b311a3c60bb4696ac97be5c448756..5a1a35199abecc3890d5733ddf678af8d4098f33 100644 --- a/tensorflow/contrib/data/python/ops/interleave_ops.py +++ b/tensorflow/contrib/data/python/ops/interleave_ops.py @@ -42,7 +42,7 @@ def parallel_interleave(map_func, `parallel_interleave()` maps `map_func` across its input to produce nested datasets, and outputs their elements interleaved. Unlike - @{tf.data.Dataset.interleave}, it gets elements from `cycle_length` nested + `tf.data.Dataset.interleave`, it gets elements from `cycle_length` nested datasets in parallel, which increases the throughput, especially in the presence of stragglers. Furthermore, the `sloppy` argument can be used to improve performance, by relaxing the requirement that the outputs are produced @@ -79,7 +79,7 @@ def parallel_interleave(map_func, Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): return readers.ParallelInterleaveDataset( @@ -138,7 +138,7 @@ def sloppy_interleave(map_func, cycle_length, block_length=1): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): return readers.ParallelInterleaveDataset( @@ -196,15 +196,15 @@ def sample_from_datasets(datasets, weights=None, seed=None): """Samples elements at random from the datasets in `datasets`. Args: - datasets: A list of @{tf.data.Dataset} objects with compatible structure. + datasets: A list of `tf.data.Dataset` objects with compatible structure. weights: (Optional.) A list of `len(datasets)` floating-point values where `weights[i]` represents the probability with which an element should be - sampled from `datasets[i]`, or a @{tf.data.Dataset} object where each + sampled from `datasets[i]`, or a `tf.data.Dataset` object where each element is such a list. Defaults to a uniform distribution across `datasets`. seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random seed that will be used to create the distribution. See - @{tf.set_random_seed} for behavior. + `tf.set_random_seed` for behavior. Returns: A dataset that interleaves elements from `datasets` at random, according to @@ -262,8 +262,8 @@ def choose_from_datasets(datasets, choice_dataset): ``` Args: - datasets: A list of @{tf.data.Dataset} objects with compatible structure. - choice_dataset: A @{tf.data.Dataset} of scalar `tf.int64` tensors between + datasets: A list of `tf.data.Dataset` objects with compatible structure. + choice_dataset: A `tf.data.Dataset` of scalar `tf.int64` tensors between `0` and `len(datasets) - 1`. Returns: diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py index 0d71be66018eeebe60de9deff24ceb6854d209d9..18515e21edfe0449514ab4f21683a600eaf48910 100644 --- a/tensorflow/contrib/data/python/ops/iterator_ops.py +++ b/tensorflow/contrib/data/python/ops/iterator_ops.py @@ -20,6 +20,7 @@ from tensorflow.python.data.ops import iterator_ops from tensorflow.python.framework import ops from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as saver_lib from tensorflow.python.training import session_run_hook @@ -117,7 +118,7 @@ class CheckpointInputPipelineHook(session_run_hook.SessionRunHook): pipeline. For saving the input pipeline checkpoint alongside the model weights use - @{tf.contrib.data.make_saveable_from_iterator} directly to create a + `tf.contrib.data.make_saveable_from_iterator` directly to create a `SaveableObject` and add to the `SAVEABLE_OBJECTS` collection. Note, however, that you will need to be careful not to restore the training iterator during eval. You can do that by not adding the iterator to the SAVEABLE_OBJECTS @@ -206,7 +207,7 @@ class CheckpointInputPipelineHook(session_run_hook.SessionRunHook): # Check if there is an existing checkpoint. If so, restore from it. # pylint: disable=protected-access - latest_checkpoint_path = saver_lib.latest_checkpoint( + latest_checkpoint_path = checkpoint_management.latest_checkpoint( self._checkpoint_saver_hook._checkpoint_dir, latest_filename=self._latest_filename) if latest_checkpoint_path: diff --git a/tensorflow/contrib/data/python/ops/map_defun.py b/tensorflow/contrib/data/python/ops/map_defun.py new file mode 100644 index 0000000000000000000000000000000000000000..54d5cd6da068fa5471b7beafcc66d76b5972e7d5 --- /dev/null +++ b/tensorflow/contrib/data/python/ops/map_defun.py @@ -0,0 +1,58 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Experimental API for optimizing `tf.data` pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import gen_dataset_ops + + +def map_defun(fn, elems, output_dtypes, output_shapes): + """Map a function on the list of tensors unpacked from `elems` on dimension 0. + + Args: + fn: A function (`function.Defun`) that takes a list of tensors and returns + another list of tensors. The output list has the same types as + output_dtypes. The elements of the output list have the same dimension 0 + as `elems`, and the remaining dimensions correspond to those of + `fn_output_shapes`. + elems: A list of tensors. + output_dtypes: A list of dtypes corresponding to the output types of the + function. + output_shapes: A list of `TensorShape`s corresponding to the output + shapes from each invocation of the function on slices of inputs. + + Raises: + ValueError: if any of the inputs are malformed. + + Returns: + A list of `Tensor` objects with the same types as `output_dtypes`. + """ + if not isinstance(elems, list): + raise ValueError("`elems` must be a list of tensors.") + if not isinstance(output_dtypes, list): + raise ValueError("`output_dtypes` must be a list of tensors.") + if not isinstance(output_shapes, list): + raise ValueError("`output_shapes` must be a list of tensors.") + + elems = [ops.convert_to_tensor(e) for e in elems] + output_shapes = [tensor_shape.TensorShape(s) for s in output_shapes] + if not all(s.is_fully_defined() for s in output_shapes): + raise ValueError("All fn output shapes must be fully defined.") + return gen_dataset_ops.map_defun(elems, output_dtypes, output_shapes, fn) diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/contrib/data/python/ops/optimization.py index 018c5115e1d5599e48bf99ccf832c7962794fc40..fa1b851ad74bcf2cff69d42bce3eaa38822cd663 100644 --- a/tensorflow/contrib/data/python/ops/optimization.py +++ b/tensorflow/contrib/data/python/ops/optimization.py @@ -36,7 +36,7 @@ def assert_next(transformations): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -56,7 +56,7 @@ def optimize(optimizations=None): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py index 0edd7c9fe974784f199c272a649b302e72d8c218..5222011d045efd9a64b4e89b248303cffbcb0b37 100644 --- a/tensorflow/contrib/data/python/ops/prefetching_ops.py +++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py @@ -92,7 +92,7 @@ def function_buffering_resource_reset(function_buffer_resource, name=None): # pylint: disable=protected-access class _PrefetchToDeviceIterator(object): - """A replacement for @{tf.data.Iterator} that prefetches to another device. + """A replacement for `tf.data.Iterator` that prefetches to another device. Args: input_dataset: The input dataset @@ -158,7 +158,7 @@ class _PrefetchToDeviceIterator(object): self._input_dataset) def get_next(self, name=None): - """See @{tf.data.Iterator.get_next}.""" + """See `tf.data.Iterator.get_next`.""" self._get_next_call_count += 1 if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD: warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE) @@ -199,7 +199,7 @@ class _PrefetchToDeviceIterator(object): class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator): - """A replacement for @{tf.data.Iterator} that prefetches to another device. + """A replacement for `tf.data.Iterator` that prefetches to another device. Args: input_dataset: The input dataset @@ -334,7 +334,7 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset): def prefetch_to_device(device, buffer_size=None): """A transformation that prefetches dataset values to the given `device`. - NOTE: Although the transformation creates a @{tf.data.Dataset}, the + NOTE: Although the transformation creates a `tf.data.Dataset`, the transformation must be the final `Dataset` in the input pipeline. Args: @@ -344,7 +344,7 @@ def prefetch_to_device(device, buffer_size=None): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): return _PrefetchToDeviceDataset(dataset, device, buffer_size) @@ -361,7 +361,7 @@ def copy_to_device(target_device, source_device="/cpu:0"): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -631,8 +631,19 @@ class MultiDeviceIterator(object): def __init__(self, dataset, devices, + max_buffer_size=1, prefetch_buffer_size=1, source_device="/cpu:0"): + """Constructs a MultiDeviceIterator. + + Args: + dataset: The input dataset to be iterated over. + devices: The list of devices to fetch data to. + max_buffer_size: Maximum size of the host side per device buffer to keep. + prefetch_buffer_size: if > 1, then we setup a buffer on each device + to prefetch into. + source_device: The host device to place the `dataset` on. + """ self._dataset = dataset self._devices = devices self._source_device = source_device @@ -659,7 +670,8 @@ class MultiDeviceIterator(object): # iterators and the multi-device iterator. self._incarnation_id = gen_dataset_ops.multi_device_iterator_init( self._dataset._as_variant_tensor(), # pylint: disable=protected-access - self._multi_device_iterator_resource) + self._multi_device_iterator_resource, + max_buffer_size=max_buffer_size) # TODO(rohanj): Explore the possibility of the MultiDeviceIterator to # initialize the device side of the pipeline. This would allow the @@ -673,7 +685,8 @@ class MultiDeviceIterator(object): i, self._multi_device_iterator_resource, self._incarnation_id, self._source_device_tensor, device, self._dataset.output_shapes, self._dataset.output_types, self._dataset.output_classes) - ds = ds.prefetch(prefetch_buffer_size) + if prefetch_buffer_size > 0: + ds = ds.prefetch(prefetch_buffer_size) with ops.device(device): self._device_iterators.append(ds.make_initializable_iterator()) i += 1 diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py index f018dd02e6ae9de69c7364677e1756d1e11bf484..3882d4bfdbe899c2ce92f829cb331b32d3d50398 100644 --- a/tensorflow/contrib/data/python/ops/readers.py +++ b/tensorflow/contrib/data/python/ops/readers.py @@ -234,7 +234,7 @@ def make_tf_record_dataset( Args: file_pattern: List of files or patterns of TFRecord file paths. - See @{tf.gfile.Glob} for pattern rules. + See `tf.gfile.Glob` for pattern rules. batch_size: An int representing the number of records to combine in a single batch. parser_fn: (Optional.) A function accepting string input to parse @@ -286,11 +286,14 @@ def make_tf_record_dataset( dataset = _maybe_shuffle_and_repeat( dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) + # NOTE(mrry): We set `drop_final_batch=True` when `num_epochs is None` to + # improve the shape inference, because it makes the batch dimension static. + # It is safe to do this because in that case we are repeating the input + # indefinitely, and all batches will be full-sized. + drop_final_batch = drop_final_batch or num_epochs is None + if parser_fn is None: - if drop_final_batch: - dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size)) - else: - dataset = dataset.batch(batch_size) + dataset = dataset.batch(batch_size, drop_remainder=drop_final_batch) else: # TODO(josh11b): if num_parallel_parser_calls is None, use some function # of num cores instead of map_and_batch's default behavior of one batch. @@ -337,7 +340,7 @@ def make_csv_dataset( Args: file_pattern: List of files or patterns of file paths containing CSV - records. See @{tf.gfile.Glob} for pattern rules. + records. See `tf.gfile.Glob` for pattern rules. batch_size: An int representing the number of records to combine in a single batch. column_names: An optional list of strings that corresponds to the CSV @@ -493,8 +496,13 @@ def make_csv_dataset( dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) # Apply batch before map for perf, because map has high overhead relative - # to the size of the computation in each map - dataset = dataset.batch(batch_size=batch_size) + # to the size of the computation in each map. + # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to + # improve the shape inference, because it makes the batch dimension static. + # It is safe to do this because in that case we are repeating the input + # indefinitely, and all batches will be full-sized. + dataset = dataset.batch(batch_size=batch_size, + drop_remainder=num_epochs is None) dataset = dataset.map(map_fn, num_parallel_calls=num_parallel_parser_calls) dataset = dataset.prefetch(prefetch_buffer_size) @@ -772,10 +780,12 @@ def make_batched_features_dataset(file_pattern, dataset = dataset.apply(stats_ops.feature_stats("record_stats")) - if drop_final_batch: - dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size)) - else: - dataset = dataset.batch(batch_size) + # NOTE(mrry): We set `drop_remainder=True` when `num_epochs is None` to + # improve the shape inference, because it makes the batch dimension static. + # It is safe to do this because in that case we are repeating the input + # indefinitely, and all batches will be full-sized. + dataset = dataset.batch( + batch_size, drop_remainder=drop_final_batch or num_epochs is None) # Parse `Example` tensors to a dictionary of `Feature` tensors. dataset = dataset.map( diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py index 182a5c6ff36fcda8c9e2c522cce07bed0c2daec9..75642f143e19c3d77e675384362c4dab94e10932 100644 --- a/tensorflow/contrib/data/python/ops/resampling.py +++ b/tensorflow/contrib/data/python/ops/resampling.py @@ -50,7 +50,7 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): """Function from `Dataset` to `Dataset` that applies the transformation.""" diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py index ea9dcfe68fa2630d915323fa295031af7d48cdfb..6b002b4a533669dd0f5e82a00aa29224a83a7e57 100644 --- a/tensorflow/contrib/data/python/ops/scan_ops.py +++ b/tensorflow/contrib/data/python/ops/scan_ops.py @@ -151,7 +151,7 @@ class _ScanDataset(dataset_ops.Dataset): def scan(initial_state, scan_func): """A transformation that scans a function across an input dataset. - This transformation is a stateful relative of @{tf.data.Dataset.map}. + This transformation is a stateful relative of `tf.data.Dataset.map`. In addition to mapping `scan_func` across the elements of the input dataset, `scan()` accumulates one or more state tensors, whose initial values are `initial_state`. @@ -166,7 +166,7 @@ def scan(initial_state, scan_func): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): return _ScanDataset(dataset, initial_state, scan_func) diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py index d7f8a73fe3d67bb83e44e962832ce34c116aef66..4356721704046199e8ef2938bde6d7d8bce68cc1 100644 --- a/tensorflow/contrib/data/python/ops/shuffle_ops.py +++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py @@ -92,11 +92,11 @@ def shuffle_and_repeat(buffer_size, count=None, seed=None): indefinitely. seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random seed that will be used to create the distribution. See - @{tf.set_random_seed} for behavior. + `tf.set_random_seed` for behavior. Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): # pylint: disable=missing-docstring diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py index e9dd74530ac64cd414d53eab5294eaa95c919131..8025dcdd16b0180aeb951a31de21e22b8e8c31c7 100644 --- a/tensorflow/contrib/data/python/ops/sliding.py +++ b/tensorflow/contrib/data/python/ops/sliding.py @@ -109,7 +109,7 @@ def sliding_window_batch(window_size, Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. Raises: ValueError: if invalid arguments are provided. diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py index 97931f75bd37d9e45864fe477c6e1620b5e4f193..3b4e98140234af0bf2128ac32f95dbdbf183cb54 100644 --- a/tensorflow/contrib/data/python/ops/stats_ops.py +++ b/tensorflow/contrib/data/python/ops/stats_ops.py @@ -29,7 +29,7 @@ class StatsAggregator(object): """A stateful resource that aggregates statistics from one or more iterators. To record statistics, use one of the custom transformation functions defined - in this module when defining your @{tf.data.Dataset}. All statistics will be + in this module when defining your `tf.data.Dataset`. All statistics will be aggregated by the `StatsAggregator` that is associated with a particular iterator (see below). For example, to record the total number of bytes produced by iterating over a dataset: @@ -39,7 +39,7 @@ class StatsAggregator(object): dataset = dataset.apply(stats_ops.bytes_produced_stats("total_bytes")) ``` - To associate a `StatsAggregator` with a @{tf.data.Iterator} object, use + To associate a `StatsAggregator` with a `tf.data.Iterator` object, use the following pattern: ```python @@ -55,7 +55,7 @@ class StatsAggregator(object): To get a protocol buffer summary of the currently aggregated statistics, use the `StatsAggregator.get_summary()` tensor. The easiest way to do this - is to add the returned tensor to the @{tf.GraphKeys.SUMMARIES} collection, + is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection, so that the summaries will be included with any existing summaries. ```python @@ -74,13 +74,13 @@ class StatsAggregator(object): self._resource = gen_dataset_ops.stats_aggregator_handle() def get_summary(self): - """Returns a string @{tf.Tensor} that summarizes the aggregated statistics. + """Returns a string `tf.Tensor` that summarizes the aggregated statistics. - The returned tensor will contain a serialized @{tf.summary.Summary} protocol + The returned tensor will contain a serialized `tf.summary.Summary` protocol buffer, which can be used with the standard TensorBoard logging facilities. Returns: - A scalar string @{tf.Tensor} that summarizes the aggregated statistics. + A scalar string `tf.Tensor` that summarizes the aggregated statistics. """ return gen_dataset_ops.stats_aggregator_summary(self._resource) @@ -122,7 +122,7 @@ def set_stats_aggregator(stats_aggregator): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -145,7 +145,7 @@ def bytes_produced_stats(tag): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -169,7 +169,7 @@ def latency_stats(tag): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): @@ -192,7 +192,7 @@ def feature_stats(tag): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): diff --git a/tensorflow/contrib/data/python/ops/threadpool.py b/tensorflow/contrib/data/python/ops/threadpool.py index 9af1e784ffb4f6d71da25f09d60343b649c5079b..dc67accdcfbc2692cbe0c961521897a316f40647 100644 --- a/tensorflow/contrib/data/python/ops/threadpool.py +++ b/tensorflow/contrib/data/python/ops/threadpool.py @@ -100,6 +100,6 @@ def override_threadpool(dataset, thread_pool): Returns: A dataset containing the same values as `dataset`, but which uses `thread_pool` to compute any of its parallel operations (such as - @{tf.data.Dataset.map}). + `tf.data.Dataset.map`). """ return _ThreadPoolDataset(dataset, thread_pool) diff --git a/tensorflow/contrib/data/python/ops/unique.py b/tensorflow/contrib/data/python/ops/unique.py index e0ce0a4ef15f6b9181bce92fb4d73bf1fab2e66c..e0d606311c4f2f678970113c1faa578dbf44b2ba 100644 --- a/tensorflow/contrib/data/python/ops/unique.py +++ b/tensorflow/contrib/data/python/ops/unique.py @@ -38,7 +38,7 @@ def unique(): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py index f53bd3f7383950d6cfdb35e12811fb1daf24b320..c455fdcba673853079ff0d162c4799e72bc8e627 100644 --- a/tensorflow/contrib/data/python/ops/writers.py +++ b/tensorflow/contrib/data/python/ops/writers.py @@ -38,13 +38,13 @@ class TFRecordWriter(object): argument_dtype=dtypes.string) def write(self, dataset): - """Returns a @{tf.Operation} to write a dataset to a file. + """Returns a `tf.Operation` to write a dataset to a file. Args: - dataset: a @{tf.data.Dataset} whose elements are to be written to a file + dataset: a `tf.data.Dataset` whose elements are to be written to a file Returns: - A @{tf.Operation} that, when run, writes contents of `dataset` to a file. + A `tf.Operation` that, when run, writes contents of `dataset` to a file. """ if not isinstance(dataset, dataset_ops.Dataset): raise TypeError("`dataset` must be a `tf.data.Dataset` object.") diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD index 1126f76f5854932bcb6a9550c100768069bbd1cc..d3628d480d31017f835b39f750df40cafa2cc0db 100644 --- a/tensorflow/contrib/distribute/BUILD +++ b/tensorflow/contrib/distribute/BUILD @@ -25,10 +25,13 @@ py_library( srcs = ["__init__.py"], visibility = ["//tensorflow:internal"], deps = [ + "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy", "//tensorflow/contrib/distribute/python:cross_tower_ops", "//tensorflow/contrib/distribute/python:mirrored_strategy", "//tensorflow/contrib/distribute/python:monitor", + "//tensorflow/contrib/distribute/python:multi_worker_strategy", "//tensorflow/contrib/distribute/python:one_device_strategy", + "//tensorflow/contrib/distribute/python:parameter_server_strategy", "//tensorflow/contrib/distribute/python:step_fn", "//tensorflow/contrib/distribute/python:tpu_strategy", "//tensorflow/python:training", diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py index 2e2c3be853cc5503c86121c142394d49e5037405..9123ca749b68a1d0066313c77914fa3fb8006a9e 100644 --- a/tensorflow/contrib/distribute/__init__.py +++ b/tensorflow/contrib/distribute/__init__.py @@ -19,10 +19,13 @@ from __future__ import division from __future__ import print_function # pylint: disable=unused-import,wildcard-import +from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy from tensorflow.contrib.distribute.python.cross_tower_ops import * from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy +from tensorflow.contrib.distribute.python.multi_worker_strategy import MultiWorkerMirroredStrategy from tensorflow.contrib.distribute.python.monitor import Monitor from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy +from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy from tensorflow.contrib.distribute.python.step_fn import * from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy from tensorflow.python.training.distribute import * @@ -32,11 +35,14 @@ from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = [ 'AllReduceCrossTowerOps', + 'CollectiveAllReduceStrategy', 'CrossTowerOps', 'DistributionStrategy', 'MirroredStrategy', + 'MultiWorkerMirroredStrategy', 'Monitor', 'OneDeviceStrategy', + 'ParameterServerStrategy', 'ReductionToOneDeviceCrossTowerOps', 'Step', 'StandardInputStep', diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index cbe741de5a67c231c0982d6d389b3591cff001ec..40a1c1707cfdeaf5f5097ce661fa5f0613f804d0 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -57,7 +57,7 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python:device_util", "//tensorflow/python/eager:test", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", ], tags = [ "no_pip", @@ -133,6 +133,24 @@ py_library( ], ) +py_library( + name = "collective_all_reduce_strategy", + srcs = ["collective_all_reduce_strategy.py"], + visibility = ["//tensorflow:internal"], + deps = [ + ":cross_tower_ops", + ":cross_tower_utils", + ":mirrored_strategy", + ":values", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:collective_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:training", + "//tensorflow/python/eager:context", + ], +) + py_library( name = "strategy_test_lib", testonly = 1, @@ -169,6 +187,7 @@ py_library( ":multi_worker_strategy", ":one_device_strategy", ":tpu_strategy", + "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/optimizer_v2:training", "//tensorflow/python:distribute", "//tensorflow/python:framework_ops", @@ -248,7 +267,7 @@ py_test( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/eager:context", - "//tensorflow/python/estimator:run_config", + "//tensorflow/python/estimator:estimator_py", "@absl_py//absl/testing:parameterized", ], ) @@ -293,11 +312,11 @@ py_library( ], deps = [ "//tensorflow/core:protos_all_py", + "//tensorflow/python:client_testlib", "//tensorflow/python:distributed_framework_test_lib", - "//tensorflow/python:platform", "//tensorflow/python:session", - "//tensorflow/python:training", - "//tensorflow/python/eager:test", + "//tensorflow/python/estimator:estimator_py", + "//third_party/py/numpy", ], ) @@ -318,8 +337,7 @@ py_library( deps = [ ":one_device_strategy", ":values", - "//tensorflow/contrib/tpu", - "//tensorflow/contrib/tpu:tpu_py", + "//tensorflow/contrib/tpu:tpu_lib", "//tensorflow/python:constant_op", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", @@ -327,6 +345,37 @@ py_library( ], ) +py_test( + name = "collective_all_reduce_strategy_test", + srcs = ["collective_all_reduce_strategy_test.py"], + srcs_version = "PY2AND3", + tags = [ + "no_pip", + ], + deps = [ + ":collective_all_reduce_strategy", + ":combinations", + ":cross_tower_utils", + ":multi_worker_test_base", + ":strategy_test_lib", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:gradients", + "//tensorflow/python:init_ops", + "//tensorflow/python:layers", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/eager:context", + "//tensorflow/python/estimator:estimator_py", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + py_library( name = "minimize_loss_test_lib", testonly = 1, @@ -391,11 +440,7 @@ cuda_py_test( "//tensorflow/contrib/optimizer_v2:training", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/eager:test", - "//tensorflow/python/estimator:dnn_linear_combined", - "//tensorflow/python/estimator:export_export", - "//tensorflow/python/estimator:numpy_io", - "//tensorflow/python/estimator:prediction_keys", - "//tensorflow/python/estimator:run_config", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/feature_column", "//tensorflow/python:framework_ops", "//tensorflow/python:platform", @@ -421,17 +466,27 @@ py_library( ], ) -cuda_py_test( - name = "step_fn_test", +py_library( + name = "step_fn_test_lib", + testonly = 1, srcs = ["step_fn_test.py"], - additional_deps = [ - ":single_loss_example", + deps = [ ":combinations", - "@absl_py//absl/testing:parameterized", - "//third_party/py/numpy", + ":single_loss_example", + "//tensorflow/contrib/tpu:tpu_lib", "//tensorflow/python:variables", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) + +cuda_py_test( + name = "step_fn_test", + srcs = ["step_fn_test.py"], + additional_deps = [ + ":step_fn_test_lib", ], tags = [ "multi_and_single_gpu", @@ -497,8 +552,11 @@ py_library( "//tensorflow/contrib/all_reduce:all_reduce_py", "//tensorflow/contrib/nccl:nccl_py", "//tensorflow/python:array_ops", + "//tensorflow/python:collective_ops", + "//tensorflow/python:device", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", + "//tensorflow/python:gradients", "//tensorflow/python:math_ops", ], ) @@ -533,7 +591,9 @@ py_library( "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python:platform", + "//tensorflow/python:resource_variable_ops", "//tensorflow/python:training", + "//tensorflow/python:variable_scope", "//tensorflow/python/eager:context", "@six_archive//:six", ], @@ -541,6 +601,7 @@ py_library( cuda_py_test( name = "cross_tower_ops_test", + size = "large", srcs = ["cross_tower_ops_test.py"], additional_deps = [ ":combinations", @@ -555,7 +616,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], - shard_count = 15, tags = [ "multi_and_single_gpu", "no_pip", @@ -627,8 +687,7 @@ cuda_py_test( "//tensorflow/contrib/distribute/python:mirrored_strategy", "//tensorflow/python:client_testlib", "//tensorflow/python:training", - "//tensorflow/python/estimator:keras", - "//tensorflow/python/estimator:run_config", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/keras", ], tags = [ diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..9afcaecf78844b011a9dbc30bb95fa3bfeda8470 --- /dev/null +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py @@ -0,0 +1,205 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Class CollectiveAllReduceStrategy implementing DistributionStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import os + +from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib +from tensorflow.contrib.distribute.python import cross_tower_utils +from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.contrib.distribute.python import values +from tensorflow.core.protobuf import cluster_pb2 +from tensorflow.python.eager import context +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import collective_ops +from tensorflow.python.training import server_lib + + +# TODO(yuefengz): move this function to a common util file. +def _normalize_cluster_spec(cluster_spec): + if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)): + return server_lib.ClusterSpec(cluster_spec) + elif not isinstance(cluster_spec, server_lib.ClusterSpec): + raise ValueError( + "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a " + "`tf.train.ClusterDef` object") + return cluster_spec + + +# TODO(yuefengz): shard the dataset. +# TODO(yuefengz): support in-graph replication. +# TODO(yuefengz): it only works with a cluster without a chief node, maybe +# support chief node? +class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy): + """Distribution strategy that uses collective ops for all-reduce. + + It is similar to the MirroredStrategy but it uses collective ops for + reduction. It currently only works for between-graph replication and its + reduction will reduce across all workers. + """ + + def __init__(self, + num_gpus_per_worker=0, + cluster_spec=None, + task_type="worker", + task_id=0): + """Initializes the object. + + Args: + num_gpus_per_worker: number of local GPUs or GPUs per worker. + cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the + cluster configurations. + task_type: the current task type, such as "worker". + task_id: the current task id. + + Raises: + ValueError: if `task_type` is not in the `cluster_spec`. + """ + self._num_gpus_per_worker = num_gpus_per_worker + self._initialize(cluster_spec, task_type, task_id) + + def _initialize(self, cluster_spec, task_type, task_id): + if task_type not in ["chief", "worker"]: + raise ValueError( + "Unrecognized task_type: %r, valid task types are: \"chief\", " + "\"worker\"." % task_type) + if cluster_spec: + self._cluster_spec = _normalize_cluster_spec(cluster_spec) + worker_device = "/job:%s/task:%d" % (task_type, task_id) + num_workers = len(self._cluster_spec.as_dict().get(task_type, [])) + if "chief" in self._cluster_spec.as_dict(): + num_workers += 1 + if not num_workers: + raise ValueError("`task_type` shoud be in `cluster_spec`.") + + # TODO(yuefengz): create a utility to infer chief. + if "chief" in self._cluster_spec.as_dict() and task_type == "chief": + assert task_id == 0 + self._is_chief = True + else: + assert task_type == "worker" + self._is_chief = task_id == 0 + else: + self._cluster_spec = None + self._is_chief = True + worker_device = "" + num_workers = 1 + self._num_workers = num_workers + + if self._num_gpus_per_worker: + local_devices = [ + "%s/device:GPU:%d" % (worker_device, i) + for i in range(self._num_gpus_per_worker) + ] + else: + local_devices = [worker_device] + + self._collective_keys = cross_tower_utils.CollectiveKeys() + super(CollectiveAllReduceStrategy, self).__init__( + devices=local_devices, + cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce( + num_workers=num_workers, + num_gpus_per_worker=self._num_gpus_per_worker, + collective_keys=self._collective_keys)) + + # Add a default device so that ops without specified devices will not end up + # on other workers. + if cluster_spec: + self._default_device = "/job:%s/replica:0/task:%d" % (task_type, task_id) + + def _create_variable(self, next_creator, *args, **kwargs): + colocate_with = kwargs.pop("colocate_with", None) + devices = self._get_devices_from(colocate_with) + group_size = len(devices) * self._num_workers + group_key = self._collective_keys.get_group_key(self._devices) + + def _real_mirrored_creator(devices, *args, **kwargs): + """Creates one MirroredVariable on the current worker.""" + index = {} + collective_instance_key = self._collective_keys.get_instance_key( + key_id=kwargs["name"]) + if "initial_value" not in kwargs: + raise ValueError("Initial value must be specified.") + initial_value = kwargs["initial_value"] + if callable(initial_value): + initial_value_fn = initial_value + else: + initial_value_fn = lambda: initial_value + + for i, d in enumerate(devices): + with ops.device(d): + if i > 0: + # Give replicas meaningful distinct names: + var0name = index[devices[0]].name.split(":")[0] + # We append a / to variable names created on towers with id > 0 to + # ensure that we ignore the name scope and instead use the given + # name as the absolute name of the variable. + kwargs["name"] = "%s/replica_%d/" % (var0name, i) + + # The initial value fn makes sure variables all initialized to + # same values. The first device of the chief worker will send their + # variable values to other devices and other workers. + def _overridden_initial_value_fn(device=d, index=i): # pylint: disable=g-missing-docstring + with ops.device(device): + initial_value = initial_value_fn() + assert not callable(initial_value) + initial_value = ops.convert_to_tensor(initial_value) + + if self._is_chief and index == 0: + bcast_send = collective_ops.broadcast_send( + initial_value, initial_value.shape, initial_value.dtype, + group_size, group_key, collective_instance_key) + with ops.control_dependencies([bcast_send]): + return array_ops.identity(initial_value) + else: + return collective_ops.broadcast_recv( + initial_value.shape, initial_value.dtype, group_size, + group_key, collective_instance_key) + + kwargs["initial_value"] = _overridden_initial_value_fn + + with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): + v = next_creator(*args, **kwargs) + + assert not isinstance(v, values.DistributedVariable) + index[d] = v + return index + + # pylint: disable=protected-access + return mirrored_strategy._create_mirrored_variable( + devices, _real_mirrored_creator, *args, **kwargs) + + def configure(self, session_config=None): + # Use TF_CONFIG to get the cluster spec and the current job. + if not self._cluster_spec: + tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) + cluster_spec = _normalize_cluster_spec(tf_config.get("cluster", {})) + + task_env = tf_config.get("task", {}) + if task_env: + task_type = task_env.get("type", "worker") + task_id = int(task_env.get("index", "0")) + else: + task_type = "worker" + task_id = 0 + + if cluster_spec: + self._initialize(cluster_spec, task_type, task_id) diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py new file mode 100644 index 0000000000000000000000000000000000000000..b5e54e3b7d7156e87731e6f79aa66262d127232c --- /dev/null +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -0,0 +1,217 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for CollectiveAllReduceStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized +import numpy as np + +from tensorflow.contrib.distribute.python import collective_all_reduce_strategy +from tensorflow.contrib.distribute.python import combinations +from tensorflow.contrib.distribute.python import cross_tower_utils +from tensorflow.contrib.distribute.python import multi_worker_test_base +from tensorflow.contrib.distribute.python import strategy_test_lib +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.eager import context +from tensorflow.python.estimator import run_config +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.layers import core +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradients +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +class DistributedCollectiveAllReduceStrategyTest( + multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase): + + collective_key_base = 0 + + @classmethod + def setUpClass(cls): + """Create a local cluster with 2 workers.""" + cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster( + num_workers=3, num_ps=0) + cls._cluster_spec = { + run_config.TaskType.WORKER: [ + 'fake_worker_0', 'fake_worker_1', 'fake_worker_2' + ] + } + + def setUp(self): + self._run_options = config_pb2.RunOptions() + self._run_options.experimental.collective_graph_key = 6 + + self._sess_config = config_pb2.ConfigProto() + self._sess_config.experimental.collective_group_leader = ( + '/job:worker/replica:0/task:0') + + # We use a different key_base for each test so that collective keys won't be + # reused. + # TODO(yuefengz, tucker): enable it to reuse collective keys in different + # tests. + DistributedCollectiveAllReduceStrategyTest.collective_key_base += 100000 + super(DistributedCollectiveAllReduceStrategyTest, self).setUp() + + def _get_test_object(self, task_type, task_id, num_gpus=0): + distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + num_gpus_per_worker=num_gpus, + cluster_spec=self._cluster_spec, + task_type=task_type, + task_id=task_id) + collective_keys = cross_tower_utils.CollectiveKeys( + group_key_start=10 * num_gpus + + DistributedCollectiveAllReduceStrategyTest.collective_key_base, + instance_key_start=num_gpus * 100 + + DistributedCollectiveAllReduceStrategyTest.collective_key_base, + instance_key_with_id_start=num_gpus * 10000 + + DistributedCollectiveAllReduceStrategyTest.collective_key_base) + distribution._collective_keys = collective_keys + distribution._cross_tower_ops._collective_keys = collective_keys + return distribution, self._workers[task_id].target + + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): + d, master_target = self._get_test_object(task_type, task_id, num_gpus) + with ops.Graph().as_default(), \ + self.test_session(config=self._sess_config, + target=master_target) as sess, \ + d.scope(): + l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker) + + def loss_fn(x): + y = array_ops.reshape(l(x), []) - constant_op.constant(1.) + return y * y + + # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for + # multiple graphs (b/111216820). + def grad_fn(x): + loss = loss_fn(x) + var_list = ( + variables.trainable_variables() + ops.get_collection( + ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) + grads = gradients.gradients(loss, var_list) + ret = list(zip(grads, var_list)) + return ret + + def update(v, g): + return v.assign_sub(0.05 * g, use_locking=True) + + one = d.broadcast(constant_op.constant([[1.]])) + + def step(): + """Perform one optimization step.""" + # Run forward & backward to get gradients, variables list. + g_v = d.call_for_each_tower(grad_fn, one) + # Update the variables using the gradients and the update() function. + before_list = [] + after_list = [] + for g, v in g_v: + fetched = d.read_var(v) + before_list.append(fetched) + with ops.control_dependencies([fetched]): + # TODO(yuefengz): support non-Mirrored variable as destinations. + g = d.reduce( + variable_scope.VariableAggregation.SUM, g, destinations=v) + with ops.control_dependencies(d.unwrap(d.update(v, update, g))): + after_list.append(d.read_var(v)) + return before_list, after_list + + before_out, after_out = step() + + if context.num_gpus() < d._num_gpus_per_worker: + return True + + sess.run( + variables.global_variables_initializer(), options=self._run_options) + + for i in range(10): + b, a = sess.run((before_out, after_out), options=self._run_options) + if i == 0: + before, = b + after, = a + + error_before = abs(before - 1) + error_after = abs(after - 1) + # Error should go down + self.assertLess(error_after, error_before) + return error_after < error_before + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) + + def _test_variable_initialization(self, task_type, task_id, num_gpus): + distribution, master_target = self._get_test_object(task_type, task_id, + num_gpus) + with ops.Graph().as_default(), \ + self.test_session(config=self._sess_config, + target=master_target) as sess, \ + distribution.scope(): + + def model_fn(): + x = variable_scope.get_variable( + 'x', + shape=(2, 3), + initializer=init_ops.random_uniform_initializer( + 1.0, 10.0, dtype=dtypes.float32)) + return array_ops.identity(x) + + x = distribution.call_for_each_tower(model_fn) + reduced_x = distribution.unwrap( + distribution.reduce( + variable_scope.VariableAggregation.MEAN, x, + destinations='/cpu:0'))[0] + + sess.run( + variables.global_variables_initializer(), options=self._run_options) + x_value, reduced_x_value = sess.run( + [x, reduced_x], options=self._run_options) + self.assertTrue(np.array_equal(x_value, reduced_x_value)) + return np.array_equal(x_value, reduced_x_value) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testVariableInitialization(self, num_gpus): + if context.num_gpus() < num_gpus: + return + self._run_between_graph_clients( + self._test_variable_initialization, + self._cluster_spec, + num_gpus=num_gpus) + + +class LocalCollectiveAllReduceStrategy(strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + def testMinimizeLossGraph(self, num_gpus=2): + # Collective ops doesn't support strategy with one device. + if context.num_gpus() < num_gpus: + return + distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + num_gpus_per_worker=num_gpus) + self._test_minimize_loss_graph(distribution) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py index 9a8ea4aa48b8cf4c5906f18d8bddacc224e0b644..a1efbcaf9ac300d7352efdb7babf4e6c1a529f3b 100644 --- a/tensorflow/contrib/distribute/python/combinations.py +++ b/tensorflow/contrib/distribute/python/combinations.py @@ -46,6 +46,7 @@ import unittest from absl.testing import parameterized import six +from tensorflow.contrib.cluster_resolver import TPUClusterResolver from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib from tensorflow.contrib.distribute.python import multi_worker_strategy from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib @@ -144,7 +145,7 @@ def _augment_with_special_arguments(test_method): """A wrapped test method that treats some arguments in a special way.""" mode = kwargs.pop("mode", "graph") - distribution = kwargs.pop("distribution", None) + distribution = kwargs.get("distribution", None) required_tpu = kwargs.pop("required_tpu", False) required_gpus = kwargs.pop("required_gpus", None) @@ -153,7 +154,6 @@ def _augment_with_special_arguments(test_method): "Do not use `required_gpus` and `distribution` together.") assert required_tpu is False, ( "Do not use `required_tpu` and `distribution` together.") - kwargs["distribution"] = distribution.strategy required_gpus = distribution.required_gpus required_tpu = distribution.required_tpu @@ -189,9 +189,13 @@ def _augment_with_special_arguments(test_method): if mode == "eager": with ops.Graph().as_default(), context.eager_mode(): + if distribution: + kwargs_to_pass["distribution"] = distribution.strategy test_method(**kwargs_to_pass) elif mode == "graph": with ops.Graph().as_default(), context.graph_mode(): + if distribution: + kwargs_to_pass["distribution"] = distribution.strategy test_method(**kwargs_to_pass) else: raise ValueError( @@ -321,7 +325,10 @@ default_strategy = NamedDistribution( one_device_strategy = NamedDistribution( "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"), required_gpus=None) -tpu_strategy = NamedDistribution("TPU", tpu_lib.TPUStrategy, required_tpu=True) +tpu_strategy = NamedDistribution( + "TPU", lambda: tpu_lib.TPUStrategy( + TPUClusterResolver(""), steps_per_run=5), + required_tpu=True) # Note that we disable prefetching for testing since prefetching makes # the input non-deterministic. mirrored_strategy_with_gpu_and_cpu = NamedDistribution( @@ -367,12 +374,14 @@ adam_optimizer_v1_fn = NamedObject( "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1)) gradient_descent_optimizer_v1_fn = NamedObject( "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2)) +optimizers_v1 = [adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn] adam_optimizer_v2_fn = NamedObject( "AdamV2", lambda: adam_v2.AdamOptimizer(0.2, epsilon=1)) gradient_descent_optimizer_v2_fn = NamedObject( "GradientDescentV2", lambda: gradient_descent_v2.GradientDescentOptimizer(0.2)) +optimizers_v2 = [adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn] graph_and_eager_modes = ["graph", "eager"] @@ -384,7 +393,7 @@ def distributions_and_v1_optimizers(): one_device_strategy, mirrored_strategy_with_gpu_and_cpu, mirrored_strategy_with_two_gpus ], - optimizer_fn=[adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn]) + optimizer_fn=optimizers_v1) def distributions_and_v2_optimizers(): @@ -394,4 +403,4 @@ def distributions_and_v2_optimizers(): one_device_strategy, mirrored_strategy_with_gpu_and_cpu, mirrored_strategy_with_two_gpus ], - optimizer_fn=[adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn]) + optimizer_fn=optimizers_v2) diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py index b6037d2133e23841a7804ed84bca302faa9574e3..3a7addf2215d403cd94601f143d16a18d92b65af 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops.py +++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py @@ -157,7 +157,7 @@ class CrossTowerOps(object): Args: aggregation: Indicates how a variable will be aggregated. Accepted values - are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}. + are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`. per_device_value: a PerDevice object. destinations: the reduction destinations. @@ -181,7 +181,7 @@ class CrossTowerOps(object): Args: aggregation: Indicates how a variable will be aggregated. Accepted values - are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}. + are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`. value_destination_pairs: a list or a tuple of tuples of PerDevice objects and destinations. If a destination is None, then the destinations are set to match the devices of the input PerDevice object. @@ -267,9 +267,9 @@ def _group_value_by_device(per_device_values): This grouping is needed to call the all-reduce library because it expects a list of the following form: - [(grad0_gpu0, v0_gpu0), (grad1_gpu0, v1_gpu0), (grad2_gpu0, v2_gpu0) ... - (grad0_gpu1, v0_gpu1), (grad1_gpu1, v1_gpu1), (grad2_gpu1, v2_gpu1) ... - (grad0_gpu2, v0_gpu2), (grad1_gpu0, v1_gpu2), (grad2_gpu0, v2_gpu2) ... + [[(grad0_gpu0, v0_gpu0), (grad1_gpu0, v1_gpu0), (grad2_gpu0, v2_gpu0) ...], + [(grad0_gpu1, v0_gpu1), (grad1_gpu1, v1_gpu1), (grad2_gpu1, v2_gpu1) ...], + [(grad0_gpu2, v0_gpu2), (grad1_gpu0, v1_gpu2), (grad2_gpu0, v2_gpu2) ...], ... ] @@ -290,7 +290,10 @@ def _group_value_by_device(per_device_values): return grouped -def _ungroup_and_make_mirrored(grouped_reduced, destinations, aggregation): +def _ungroup_and_make_mirrored(grouped_reduced, + destinations, + aggregation, + num_between_graph_workers=1): """Ungroup results from all-reduce and make Mirrored objects. Each all-reduce result will be divided by the number of destinations before @@ -302,7 +305,9 @@ def _ungroup_and_make_mirrored(grouped_reduced, destinations, aggregation): cross_tower_utils.aggregate_gradients_using*. destinations: a list of device strings for returned Mirrored objects. aggregation: Indicates how a variable will be aggregated. Accepted values - are @{tf.VariableAggregation.SUM}, @{tf.VariableAggregation.MEAN}. + are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`. + num_between_graph_workers: number of workers in the between-graph + replication. Returns: a list of Mirrored objects. @@ -311,7 +316,8 @@ def _ungroup_and_make_mirrored(grouped_reduced, destinations, aggregation): for d, per_device_reduced in enumerate(grouped_reduced): for i, (v, _) in enumerate(per_device_reduced): if aggregation == vs.VariableAggregation.MEAN: - index[i][destinations[d]] = v / len(destinations) + index[i][destinations[d]] = v / ( + len(destinations) * num_between_graph_workers) else: index[i][destinations[d]] = v return [value_lib.Mirrored(v) for v in index] @@ -561,12 +567,12 @@ class AllReduceCrossTowerOps(CrossTowerOps): def _batch_all_reduce(self, aggregation, per_device_values): """All reduce algorithm in a batch.""" - logging.info( - "batch_all_reduce invoked for batches size = %d with " + logging.log_first_n( + logging.INFO, "batch_all_reduce invoked for batches size = %d with " "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and " - "agg_small_grads_max_group = %d", len(per_device_values), - self._all_reduce_alg, self._num_packs, self._agg_small_grads_max_bytes, - self._agg_small_grads_max_group) + "agg_small_grads_max_group = %d" % + (len(per_device_values), self._all_reduce_alg, self._num_packs, + self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) destinations = per_device_values[0].devices grouped = _group_value_by_device(per_device_values) @@ -671,12 +677,13 @@ class MultiWorkerAllReduce(AllReduceCrossTowerOps): def _batch_all_reduce(self, aggregation, per_device_values): """All reduce algorithm in a batch.""" - logging.info( + logging.log_first_n( + logging.INFO, "distributed batch_all_reduce invoked for batches size = %d with " "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d " - "and agg_small_grads_max_group = %d", len(per_device_values), - self._all_reduce_spec, self._num_packs, self._agg_small_grads_max_bytes, - self._agg_small_grads_max_group) + "and agg_small_grads_max_group = %d" % + (len(per_device_values), self._all_reduce_spec, self._num_packs, + self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) destinations = sorted(per_device_values[0].devices) device_grads = _group_value_by_device(per_device_values) @@ -719,6 +726,102 @@ class MultiWorkerAllReduce(AllReduceCrossTowerOps): aggregation) +# TODO(yuefengz): support in-graph collective all-reduce. +class CollectiveAllReduce(CrossTowerOps): + """All-reduce cross tower ops using collective ops. + + In the between-graph replicated training, it will still do all-reduces across + all workers and then put results on the right destinations. + """ + + def __init__(self, + num_workers=1, + num_gpus_per_worker=0, + all_reduce_merge_scope=1, + collective_keys=None): + """Initializes the object. + + Args: + num_workers: number of workers in the between-graph replicated training. + num_gpus_per_worker: number of GPUs per worker. + all_reduce_merge_scope: size of groups into which to partition consecutive + gradients grouped under a common 'allreduce' name scope. This is useful + for some optimization of collective ops. + collective_keys: an optional CollectiveKey object. + """ + self._num_workers = num_workers + self._num_gpus_per_worker = num_gpus_per_worker + self._all_reduce_merge_scope = all_reduce_merge_scope + self._collective_keys = collective_keys or cross_tower_utils.CollectiveKeys( + ) + super(CollectiveAllReduce, self).__init__() + + # TODO(yuefengz, tucker): is index slices supported by collective ops? + def _reduce(self, aggregation, per_device_value, destinations): + all_reduced = self._batch_all_reduce(aggregation, [per_device_value])[0] + if destinations is None or _devices_match(per_device_value, destinations): + return all_reduced + else: + index = {} + for d in get_devices_from(destinations): + # pylint: disable=protected-access + if d in all_reduced._index: + index[d] = all_reduced._index[d] + else: + with ops.device(d): + index[d] = array_ops.identity(list(all_reduced._index.values())[0]) + return value_lib.Mirrored(index) + + def _batch_reduce(self, aggregation, value_destination_pairs): + return [ + self._reduce(aggregation, t, destinations=v) + for t, v in value_destination_pairs + ] + + def _batch_all_reduce(self, aggregation, per_device_values): + """All-reduce across all workers in a batch.""" + if context.executing_eagerly(): + raise ValueError("Eager mode with collective ops is not supported yet.") + + logging.log_first_n( + logging.INFO, "Collective All-reduce invoked with batches size = %d, " + "num_workers = %d" % (len(per_device_values), self._num_workers), 10) + + grouped_by_tower = _group_value_by_device(per_device_values) + + grouped_by_var = list(zip(*grouped_by_tower)) + # grouped_by_var is grouped by variables and takes the following format: + # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..), + # ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu0, v1_gpu2) ..), + # ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu0, v2_gpu2) ..), + # ... + # ] + chunked_gv = [ + grouped_by_var[x:x + self._all_reduce_merge_scope] + for x in range(0, len(grouped_by_var), self._all_reduce_merge_scope) + ] + + reduced_gv_list = [] + for chunk in chunked_gv: + with ops.name_scope("allreduce"): + for grad_and_vars in chunk: + scaled_grads = [g for g, _ in grad_and_vars] + collective_reduced = cross_tower_utils.build_collective_reduce( + scaled_grads, self._num_workers, self._collective_keys, "Add", + "Id") + result = [] + for (_, v), g in zip(grad_and_vars, collective_reduced): + result.append([g, v]) + reduced_gv_list.append(result) + + new_tower_grads = [list(x) for x in zip(*reduced_gv_list)] + return _ungroup_and_make_mirrored( + new_tower_grads, + per_device_values[0].devices, + aggregation, + num_between_graph_workers=self._num_workers) + + _dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]] diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py index 6a780ff60ffcd59d416278bfde6d005d7ad37a68..aec53b01d7a089fec08eec6ea43373a2cd8267d6 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py +++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py @@ -21,13 +21,17 @@ from __future__ import print_function import itertools from absl.testing import parameterized +import numpy as np from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib +from tensorflow.contrib.distribute.python import cross_tower_utils from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import values as value_lib +from tensorflow.core.protobuf import config_pb2 from tensorflow.python.eager import context from tensorflow.python.eager import test +from tensorflow.python.estimator import run_config from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -376,5 +380,166 @@ class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase, self._testReductionAndBroadcast(cross_tower_ops, distribution) +class MultiWorkerCollectiveAllReduceTest( + multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase): + + collective_key_base = 100000 + + @classmethod + def setUpClass(cls): + """Create a local cluster with 2 workers.""" + cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster( + num_workers=3, num_ps=0) + cls._cluster_spec = { + run_config.TaskType.WORKER: [ + "fake_worker_0", "fake_worker_1", "fake_worker_2" + ] + } + + def setUp(self): + super(MultiWorkerCollectiveAllReduceTest, self).setUp() + # Reusing keys are not supported well. So we have to give a different + # collective key base for different tests. + MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000 + + def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False): + collective_keys = cross_tower_utils.CollectiveKeys( + group_key_start=10 * num_gpus + + MultiWorkerCollectiveAllReduceTest.collective_key_base, + instance_key_start=num_gpus * 100 + + MultiWorkerCollectiveAllReduceTest.collective_key_base, + instance_key_with_id_start=num_gpus * 10000 + + MultiWorkerCollectiveAllReduceTest.collective_key_base) + if local_mode: + collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce( + 1, num_gpus, collective_keys=collective_keys) + if num_gpus: + devices = ["/device:GPU:%d" % i for i in range(num_gpus)] + else: + devices = ["/device:CPU:0"] + return collective_all_reduce_ops, devices, "local" + else: + collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce( + 3, num_gpus, collective_keys=collective_keys) + if num_gpus: + devices = [ + "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i) + for i in range(num_gpus) + ] + else: + devices = ["/job:%s/task:%d" % (task_type, task_id)] + return collective_all_reduce_ops, devices, self._workers[task_id].target + + def _assert_values_equal(self, left, right, sess): + if isinstance(left, list): + for l, r in zip(left, right): + self._assert_values_equal(l, r, sess) + else: + self.assertEqual(type(left), type(right)) + self.assertEqual(set(left.devices), set(right.devices)) + + run_options = config_pb2.RunOptions() + run_options.experimental.collective_graph_key = 6 + + left_values = np.array( + sess.run(list(left._index.values()), options=run_options)).flatten() + right_values = np.array(list(right._index.values())).flatten() + self.assertEqual(len(left_values), len(right_values)) + for l, r in zip(left_values, right_values): + self.assertEqual(l, r) + + def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False): + collective_all_reduce, devices, master_target = self._get_test_objects( + task_type, task_id, num_gpus, local_mode=local_mode) + if local_mode: + num_workers = 1 + worker_device = None + else: + num_workers = len(self._workers) + worker_device = "/job:%s/task:%d" % (task_type, task_id) + with ops.Graph().as_default(), \ + ops.device(worker_device), \ + self.test_session(target=master_target) as sess: + # Collective ops doesn't support scalar tensors, so we have to construct + # 1-d tensors. + values = [constant_op.constant([float(d)]) for d in range(len(devices))] + per_device = _make_per_device(values, devices) + mean = np.array([(len(devices) - 1.) / 2.]) + + values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))] + per_device_2 = _make_per_device(values_2, devices) + mean_2 = np.array([mean[0] + 1.]) + + destination_mirrored = _fake_mirrored(1., devices) + destination_different = _fake_mirrored(1., _cpu_device) + destination_str = _cpu_device + destination_list = devices + + all_destinations = [ + None, destination_mirrored, destination_different, destination_str, + destination_list + ] + + # test reduce() + for destinations in all_destinations: + self._assert_values_equal( + collective_all_reduce.reduce( + vs.VariableAggregation.MEAN, + per_device, + destinations=destinations), + _fake_mirrored(mean, destinations or per_device), sess) + self._assert_values_equal( + collective_all_reduce.reduce( + vs.VariableAggregation.MEAN, + per_device_2, + destinations=destinations), + _fake_mirrored(mean_2, destinations or per_device), sess) + self._assert_values_equal( + collective_all_reduce.reduce( + vs.VariableAggregation.SUM, + per_device, + destinations=destinations), + _fake_mirrored(mean * len(devices) * num_workers, destinations or + per_device), sess) + self._assert_values_equal( + collective_all_reduce.reduce( + vs.VariableAggregation.SUM, + per_device_2, + destinations=destinations), + _fake_mirrored(mean_2 * len(devices) * num_workers, destinations or + per_device), sess) + + # test batch_reduce() + for d1, d2 in itertools.product(all_destinations, all_destinations): + self._assert_values_equal( + collective_all_reduce.batch_reduce(vs.VariableAggregation.MEAN, + [(per_device, d1), + (per_device_2, d2)]), + [ + _fake_mirrored(mean, d1 or per_device), + _fake_mirrored(mean_2, d2 or per_device_2) + ], sess) + self._assert_values_equal( + collective_all_reduce.batch_reduce(vs.VariableAggregation.SUM, + [(per_device, d1), + (per_device_2, d2)]), + [ + _fake_mirrored(mean * len(devices) * num_workers, d1 or + per_device), + _fake_mirrored(mean_2 * len(devices) * num_workers, d2 or + per_device_2) + ], sess) + + return True + + @combinations.generate( + combinations.combine(mode=["graph"], num_gpus=[0, 1, 2])) + def testReductionDistributed(self, num_gpus): + if context.num_gpus() < num_gpus: + return + self._run_between_graph_clients(self._test_reduction, self._cluster_spec, + num_gpus) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py index 2bb088e704c584598b863b1b836166af2a5bb12c..24cb08fb48f832572da5ae2113e6c224557c6a81 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_utils.py +++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py @@ -19,13 +19,16 @@ from __future__ import division from __future__ import print_function import collections as pycoll +import threading from tensorflow.contrib import nccl from tensorflow.contrib.all_reduce.python import all_reduce from tensorflow.contrib.distribute.python import values as value_lib +from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import collective_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops @@ -218,6 +221,146 @@ def split_grads_by_size(threshold_size, device_grads): return small_grads, large_grads +# threading.Lock() cannot be pickled and therefore cannot be a field of +# CollectiveKeys. +_lock = threading.Lock() + + +# TODO(yuefengz): use random key starts to avoid reusing keys? +class CollectiveKeys(object): + """Class that manages collective keys. + + We need to manage three different keys for collective: + + *Group key*: an integer key to identify the set of cooperative devices. + Collective ops work under the same set of devices must using the same group + key. + + *Instance key*: an integer key to identify the set of same counterpart of + tensors on different devices in a device group that need to be all-reduced. + + "Graph key": an integer key that is unique key graph. This is used to support + multiple graphs per client session. It must be non-zero and set in the + `config` argument of each call to `session.run`. + """ + + def __init__(self, + group_key_start=1, + instance_key_start=100, + instance_key_with_id_start=10000): + """Initializes the object. + + Args: + group_key_start: the starting integer of group key. + instance_key_start: the starting integer of instance key. + instance_key_with_id_start: the starting integer of instance key that is + recorded with an id. + """ + self._group_key = group_key_start + self._group_key_table = dict() + + # For instance keys with ids + self._instance_key_id_to_key_table = dict() + self._instance_key_with_id_counter = instance_key_with_id_start + + # For instance keys without ids + self._instance_key_start = instance_key_start + + self._thread_local = threading.local() + + def _get_thread_local_object(self): + # We make instance key without key ids thread local so that it will work + # with MirroredStrategy and distribute coordinator. + if not hasattr(self._thread_local, 'instance_key'): + self._thread_local.instance_key = self._instance_key_start + return self._thread_local + + def get_group_key(self, devices): + """Returns a group key for the set of devices. + + Args: + devices: list of strings naming devices in a collective group. + + Returns: + int key uniquely identifying the set of device names. + """ + parsed = [pydev.DeviceSpec.from_string(d) for d in devices] + # In the between-graph replicated training, different workers need to get + # the same device key. So we remove the task_type and task_id from the + # devices. + # TODO(yuefengz): in the in-graph replicated training, we need to include + # task_type and task_id. + names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed]) + key_id = ','.join(names) + with _lock: + if key_id not in self._group_key_table: + new_key = self._group_key + self._group_key += 1 + self._group_key_table[key_id] = new_key + return self._group_key_table[key_id] + + def get_instance_key(self, key_id=None): + """Returns a new instance key for use in defining a collective op. + + Args: + key_id: optional string. If set, key will be recorded and the same key + will be returned when the same key_id is provided. If not, an increasing + instance key will be returned. + """ + if key_id: + with _lock: + if key_id not in self._instance_key_id_to_key_table: + self._instance_key_with_id_counter += 1 + self._instance_key_id_to_key_table[key_id] = ( + self._instance_key_with_id_counter) + return self._instance_key_id_to_key_table[key_id] + else: + v = self._get_thread_local_object().instance_key + self._get_thread_local_object().instance_key += 1 + return v + + +def build_collective_reduce(input_tensors, + num_workers, + collective_keys, + reduction_op='Add', + unary_op='Id'): + """Build a subgraph that does one full all-reduce, using the collective Op. + + Args: + input_tensors: tensors within a single worker graph that are to be reduced + together; must be one per device. + num_workers: total number of workers with identical independent graphs that + will be doing this same reduction. The reduction will actually include + the corresponding tensors at all these workers. + collective_keys: a CollectiveKeys object. + reduction_op: string naming the reduction op. + unary_op: string naming the unary final op. + + Returns: + An array of final tensors, one per device, computed by the full reduction. + + Raises: + ValueError: There must be at least two tensors over all the workers. + """ + group_size = len(input_tensors) * num_workers + if group_size < 2: + raise ValueError('num_workers * len(input_tensors) must be 2 or greater') + devices = [t.device for t in input_tensors] + num_devices = len(devices) + group_key = collective_keys.get_group_key(devices) + instance_key = collective_keys.get_instance_key() + out_tensors = [] + subdiv_offsets = [0] # TODO(tucker): maybe support non-default subdiv spec + for d in range(num_devices): + with ops.device(devices[d]): + reduce_op = collective_ops.all_reduce( + input_tensors[d], group_size, group_key, instance_key, reduction_op, + unary_op, subdiv_offsets) + out_tensors.append(reduce_op) + return out_tensors + + def sum_grad_and_var_all_reduce(grad_and_vars, num_workers, alg, @@ -253,10 +396,10 @@ def sum_grad_and_var_all_reduce(grad_and_vars, else: raise ValueError('unsupported all_reduce alg: ', alg) - result = [] - for (_, v), g in zip(grad_and_vars, summed_grads): - result.append([g, v]) - return result + result = [] + for (_, v), g in zip(grad_and_vars, summed_grads): + result.append([g, v]) + return result def sum_gradients_all_reduce(dev_prefixes, tower_grads, num_workers, alg, diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py index 34410a6470185ac2821bc6a59de9230ff478aeb6..3e00cf4332da8cb18de0444704f88462d415c447 100644 --- a/tensorflow/contrib/distribute/python/estimator_integration_test.py +++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py @@ -75,8 +75,11 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, y=data, batch_size=batch_size // len(distribution.worker_devices), shuffle=True) - eval_input_fn = numpy_io.numpy_input_fn( - x={'x': data}, y=data, batch_size=batch_size, shuffle=False) + eval_input_fn = self.dataset_input_fn( + x={'x': data}, + y=data, + batch_size=batch_size // len(distribution.worker_devices), + shuffle=False) predict_input_fn = numpy_io.numpy_input_fn( x={'x': data}, batch_size=batch_size, shuffle=False) @@ -96,7 +99,8 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, # TODO(isaprykin): Work around the colocate_with error. dnn_optimizer=adagrad.AdagradOptimizer(0.001), linear_optimizer=adagrad.AdagradOptimizer(0.001), - config=run_config.RunConfig(train_distribute=distribution)) + config=run_config.RunConfig( + train_distribute=distribution, eval_distribute=distribution)) num_steps = 10 estimator.train(train_input_fn, steps=num_steps) diff --git a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py index 00c25c7a2482a559c8b94ff3be86c4961dfb439f..44a69ed23a4e00ab81d5b51ae0c14550bd493f14 100644 --- a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py +++ b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py @@ -59,7 +59,8 @@ def build_model_fn_optimizer(): def main(_): distribution = tf.contrib.distribute.MirroredStrategy( ["/device:GPU:0", "/device:GPU:1"]) - config = tf.estimator.RunConfig(train_distribute=distribution) + config = tf.estimator.RunConfig(train_distribute=distribution, + eval_distribute=distribution) def input_fn(): features = tf.data.Dataset.from_tensors([[1.]]).repeat(10) @@ -70,7 +71,7 @@ def main(_): model_fn=build_model_fn_optimizer(), config=config) estimator.train(input_fn=input_fn, steps=10) - eval_result = estimator.evaluate(input_fn=input_fn) + eval_result = estimator.evaluate(input_fn=input_fn, steps=10) print("Eval result: {}".format(eval_result)) def predict_input_fn(): diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py index 2b05884b9b93470ef9a764cbedbc91bd3912c611..518ec9c4232465c3ecd0e4161f707dac499430c7 100644 --- a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py +++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py @@ -57,7 +57,8 @@ def main(args): # tf.Estimator that utilizes the DistributionStrategy. strategy = tf.contrib.distribute.MirroredStrategy( ['/device:GPU:0', '/device:GPU:1']) - config = tf.estimator.RunConfig(train_distribute=strategy) + config = tf.estimator.RunConfig( + train_distribute=strategy, eval_distribute=strategy) keras_estimator = tf.keras.estimator.model_to_estimator( keras_model=model, config=config, model_dir=model_dir) diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index 75ecd90dcffa7a786b78238ef453c4c8e4346afa..4facd72d12680a53cc3f5e2ded2585bc9716ea3c 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -12,33 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for Keras Sequential and Functional models.""" +"""Tests for tf.keras models using DistributionStrategy.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import os - import numpy as np from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.contrib.distribute.python import values from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.estimator import keras as keras_lib from tensorflow.python.estimator import run_config as run_config_lib +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.engine import distributed_training_utils from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import gradient_descent from tensorflow.python.training import rmsprop + _RANDOM_SEED = 1337 _TRAIN_SIZE = 200 _INPUT_SIZE = (10,) _NUM_CLASS = 2 +# TODO(anjalisridhar): Add a decorator that will allow us to run these tests as +# part of the tf.keras unit tests suite. def simple_sequential_model(): model = keras.models.Sequential() model.add(keras.layers.Dense(16, activation='relu', input_shape=_INPUT_SIZE)) @@ -84,7 +91,7 @@ def get_ds_test_input_fn(): return dataset -class TestKerasDistributionStrategy(test_util.TensorFlowTestCase): +class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): def setUp(self): self._base_dir = os.path.join(self.get_temp_dir(), @@ -107,7 +114,8 @@ class TestKerasDistributionStrategy(test_util.TensorFlowTestCase): optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist) + train_distribute=dist, + eval_distribute=dist) with self.test_session(): est_keras = keras_lib.model_to_estimator( keras_model=keras_model, config=config) @@ -144,5 +152,457 @@ class TestKerasDistributionStrategy(test_util.TensorFlowTestCase): writer_cache.FileWriterCache.clear() gfile.DeleteRecursively(self._config.model_dir) + def test_keras_optimizer_with_distribution_strategy(self): + dist = mirrored_strategy.MirroredStrategy( + devices=['/device:GPU:0', '/device:GPU:1']) + keras_model = simple_sequential_model() + keras_model.compile( + loss='categorical_crossentropy', + optimizer=keras.optimizers.rmsprop(lr=0.01)) + + config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, + model_dir=self._base_dir, + train_distribute=dist) + with self.test_session(): + est_keras = keras_lib.model_to_estimator(keras_model=keras_model, + config=config) + with self.assertRaisesRegexp(ValueError, + 'Only TensorFlow native optimizers are ' + 'supported with DistributionStrategy.'): + est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16) + + writer_cache.FileWriterCache.clear() + gfile.DeleteRecursively(self._config.model_dir) + + +class TestWithDistributionStrategy(test.TestCase): + + def test_validating_dataset_input_tensors_with_shape_mismatch(self): + with self.test_session(): + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', + '/device:CPU:0']) + a = constant_op.constant([1, 2], shape=(1, 2)) + b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2)) + x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b}) + y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a}) + with strategy.scope(): + # Removed device and input tensor shape details from the error message + # since the order of the device and the corresponding input tensor shape + # is not deterministic over different runs. + with self.assertRaisesRegexp(ValueError, + 'Input tensor shapes do not match for ' + 'distributed tensor inputs ' + 'DistributedValues:.+'): + distributed_training_utils.validate_distributed_dataset_inputs( + strategy, x, y) + + def test_validating_dataset_input_tensors_with_dtype_mismatch(self): + with self.test_session(): + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', + '/device:CPU:0']) + a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32) + b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64) + x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b}) + y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a}) + with strategy.scope(): + # Removed device and input tensor dtype details from the error message + # since the order of the device and the corresponding input tensor dtype + # is not deterministic over different runs. + with self.assertRaisesRegexp(ValueError, + 'Input tensor dtypes do not match for ' + 'distributed tensor inputs ' + 'DistributedValues:.+'): + distributed_training_utils.validate_distributed_dataset_inputs( + strategy, x, y) + + def test_calling_model_on_same_dataset(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = gradient_descent.GradientDescentOptimizer(0.001) + loss = 'mse' + metrics = ['mae'] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', + '/device:GPU:0']) + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + # Call fit with validation data + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + validation_data=dataset, validation_steps=2) + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + validation_data=dataset, validation_steps=2) + model.predict(dataset, steps=2) + + def test_fit_with_tuple_and_dict_dataset_inputs(self): + with self.test_session(): + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') + + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) + + model = keras.models.Model([a, b], [d, e]) + + optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) + loss = 'mse' + metrics = ['mae'] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', + '/device:CPU:0']) + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + input_a_np = np.random.random((10, 3)) + input_b_np = np.random.random((10, 3)) + output_d_np = np.random.random((10, 4)) + output_e_np = np.random.random((10, 4)) + + # Test with tuples + dataset_tuple = dataset_ops.Dataset.from_tensor_slices(( + (input_a_np, input_b_np), (output_d_np, output_e_np))) + dataset_tuple = dataset_tuple.repeat(100) + dataset_tuple = dataset_tuple.batch(10) + + model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) + + # Test with dict + dataset_dict = dataset_ops.Dataset.from_tensor_slices(( + {'input_a': input_a_np, 'input_b': input_b_np}, + (output_d_np, output_e_np))) + dataset_dict = dataset_dict.repeat(100) + dataset_dict = dataset_dict.batch(10) + + model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1) + + def test_fit_eval_and_predict_methods_on_dataset(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = gradient_descent.GradientDescentOptimizer(0.001) + loss = 'mse' + metrics = ['mae'] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', + '/device:CPU:0']) + + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) + model.evaluate(dataset, steps=2, verbose=1) + model.predict(dataset, steps=2) + # Test with validation data + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + validation_data=dataset, validation_steps=2) + + def test_raise_error_for_stateful_metrics(self): + + class ExampleStatefulMetric(keras.layers.Layer): + + def __init__(self, name='true_positives', **kwargs): + super(ExampleStatefulMetric, self).__init__(name=name, **kwargs) + self.stateful = True + + def __call__(self, y_true, y_pred): + return y_pred - y_true + + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = gradient_descent.GradientDescentOptimizer(0.001) + loss = 'mse' + metrics = ['mae', ExampleStatefulMetric()] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', + '/device:GPU:0']) + with self.assertRaisesRegexp( + NotImplementedError, 'Stateful metrics are not supported with ' + 'DistributionStrategy.'): + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + def test_unsupported_features(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = gradient_descent.GradientDescentOptimizer(0.001) + loss = 'mse' + metrics = ['mae'] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', + '/device:GPU:0']) + + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + # Test with validation split + with self.assertRaisesRegexp( + ValueError, '`validation_split` argument is not ' + 'supported when input `x` is a dataset or a ' + 'dataset iterator.+'): + model.fit(dataset, + epochs=1, steps_per_epoch=2, verbose=0, + validation_split=0.5, validation_steps=2) + + # Test with sample weight. + sample_weight = np.random.random((10,)) + with self.assertRaisesRegexp( + NotImplementedError, 'sample_weight is currently not supported when ' + 'using DistributionStrategy.'): + model.fit( + dataset, + epochs=1, + steps_per_epoch=2, + verbose=0, + sample_weight=sample_weight) + + # Test with not specifying the `steps` argument. + with self.assertRaisesRegexp( + ValueError, 'you should specify the `steps_per_epoch` argument'): + model.fit(dataset, epochs=1, verbose=0) + with self.assertRaisesRegexp(ValueError, + 'you should specify the `steps` argument'): + model.evaluate(dataset, verbose=0) + + with self.assertRaisesRegexp(ValueError, + 'you should specify the `steps` argument'): + model.predict(dataset, verbose=0) + + def test_calling_with_unsupported_predefined_callbacks(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = gradient_descent.GradientDescentOptimizer(0.001) + loss = 'mse' + metrics = ['mae'] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', + '/device:GPU:0']) + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + def schedule(_): + return 0.001 + with self.assertRaisesRegexp(ValueError, + 'LearningRateScheduler callback is not ' + 'supported with DistributionStrategy.'): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + callbacks=[keras.callbacks.LearningRateScheduler(schedule)]) + + with self.assertRaisesRegexp(ValueError, + 'ReduceLROnPlateau callback is not ' + 'supported with DistributionStrategy.'): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + callbacks=[keras.callbacks.ReduceLROnPlateau()]) + with self.assertRaisesRegexp(ValueError, + 'histogram_freq in the TensorBoard callback ' + 'is not supported when using ' + 'DistributionStrategy.'): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, + callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)]) + + def test_dataset_input_shape_validation(self): + with self.test_session(): + x = keras.layers.Input(shape=(3,), name='input') + y = keras.layers.Dense(4, name='dense')(x) + model = keras.Model(x, y) + + optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', + '/device:GPU:0']) + + model.compile(optimizer, loss, distribute=strategy) + + # User forgets to batch the dataset + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + + with self.assertRaisesRegexp(ValueError, + 'expected input to have 2 dimensions'): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + + # Wrong input shape + inputs = np.zeros((10, 5), dtype=np.float32) + targets = np.zeros((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + + with self.assertRaisesRegexp(ValueError, + 'expected input to have shape'): + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + + def test_learning_phase_value(self): + # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare + # meaningful values. Currently we don't pass the learning phase if the + # Lambda layer uses the learning phase. + with self.test_session(): + x = keras.layers.Input(shape=(16,), name='input') + y = keras.layers.Dense(16)(x) + z = keras.layers.Dropout(0.9999)(y) + model = keras.Model(x, z) + + optimizer = gradient_descent.GradientDescentOptimizer(0.005) + loss = 'mse' + metrics = ['acc'] + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', + '/device:CPU:0']) + + model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + + inputs = np.random.rand(10, 16) + targets = np.ones((10, 16), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(8) + + hist = model.fit(dataset, epochs=5, steps_per_epoch=20, verbose=1) + self.assertEqual(hist.history['acc'][0], 1) + + evaluate_output = model.evaluate(dataset, steps=20) + self.assertEqual(evaluate_output[1], 0) + + predict_output = model.predict(dataset, steps=1) + self.assertNotEqual(np.mean(predict_output), 0) + + +class LossMaskingWithDistributionStrategyTest(test.TestCase): + + def test_masking(self): + with self.test_session(): + np.random.seed(1337) + x = np.array([[[1], [1]], [[0], [0]]]) + model = keras.models.Sequential() + model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1))) + model.add( + keras.layers.TimeDistributed( + keras.layers.Dense(1, kernel_initializer='one'))) + strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', + '/device:GPU:0']) + + model.compile(loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + distribute=strategy) + y = np.array([[[1], [1]], [[1], [1]]]) + dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10) + hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2) + self.assertEqual(hist.history['loss'][0], 0) + + +class NormalizationLayerWithDistributionStrategyTest(test.TestCase): + + def test_batchnorm_correctness(self): + with self.test_session(): + model = keras.models.Sequential() + norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8) + model.add(norm) + strategy = mirrored_strategy.MirroredStrategy(['/device:CPU:0', + '/device:GPU:0']) + model.compile(loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + distribute=strategy) + + # centered on 5.0, variance 10.0 + x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10)) + dataset = dataset_ops.Dataset.from_tensor_slices((x, x)) + dataset = dataset.repeat(100) + dataset = dataset.batch(32) + + model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10) + out = model.predict(dataset, steps=2) + out -= keras.backend.eval(norm.beta) + out /= keras.backend.eval(norm.gamma) + np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) + np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) + + +class CorrectnessWithDistributionStrategyTest(test.TestCase): + + def test_correctness(self): + with self.test_session(): + keras.backend.set_image_data_format('channels_last') + num_samples = 10000 + x_train = np.random.rand(num_samples, 1) + y_train = 3 * x_train + x_train = x_train.astype('float32') + y_train = y_train.astype('float32') + + model = keras.Sequential() + model.add(keras.layers.Dense(1, input_shape=(1,))) + + # With DistributionStrategy + dataset_with = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) + dataset_with = dataset_with.batch(32) + strategy = mirrored_strategy.MirroredStrategy(devices=['/device:CPU:0', + '/device:GPU:0'], + prefetch_on_device=False) + + model.compile(loss=keras.losses.mean_squared_error, + optimizer=gradient_descent.GradientDescentOptimizer(0.5), + distribute=strategy) + model.fit(x=dataset_with, epochs=1, steps_per_epoch=310) + wts_with_ds = model.get_weights() + + x_predict = [[1], [2], [3], [4]] + predict_dataset_with = dataset_ops.Dataset.from_tensor_slices((x_predict, + x_predict)) + predict_dataset_with = predict_dataset_with.batch(2) + predict_with_ds = model.predict(predict_dataset_with, steps=1) + predict_with_ds = np.reshape(predict_with_ds, (4, 1)) + + # Without DistributionStrategy + dataset_without = dataset_ops.Dataset.from_tensor_slices((x_train, + y_train)) + dataset_without = dataset_without.batch(64) + + model.compile(loss=keras.losses.mean_squared_error, + optimizer=gradient_descent.GradientDescentOptimizer(0.5)) + model.fit(x=dataset_without, epochs=1, steps_per_epoch=310) + wts_without_ds = model.get_weights() + + x_predict = [[1], [2], [3], [4]] + predict_dataset_without = dataset_ops.Dataset.from_tensor_slices(( + x_predict, x_predict)) + predict_dataset_without = predict_dataset_without.batch(4) + predict_without_ds = model.predict(predict_dataset_without, steps=1) + + # Verify that the weights are the same within some limits of tolerance. + np.testing.assert_allclose(wts_with_ds[0], wts_without_ds[0], rtol=1e-3) + # Verify that the predicted outputs are the same within some limits of + # tolerance. + np.testing.assert_allclose(predict_with_ds, predict_without_ds, rtol=1e-3) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py index 6c6bf143098c1bba64d47efce1bfface7682683d..2f3d6bdd3f4e4bc7352d7b378ed40b930608ef08 100644 --- a/tensorflow/contrib/distribute/python/metrics_v1_test.py +++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py @@ -19,7 +19,6 @@ from __future__ import print_function from absl.testing import parameterized -from tensorflow.contrib.data.python.ops import batching from tensorflow.contrib.distribute.python import combinations from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import test @@ -183,7 +182,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase): def _dataset_fn(): dataset = dataset_ops.Dataset.range(1000).map(math_ops.to_float) # Want to produce a fixed, known shape, so drop remainder when batching. - dataset = dataset.apply(batching.batch_and_drop_remainder(4)) + dataset = dataset.batch(4, drop_remainder=True) return dataset def _expected_fn(num_batches): diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index aeeb9553e6044a0a928936597400e582e0329b95..aa7a61bb3b24df64dfc2a118611e96242a72b025 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -25,11 +25,13 @@ from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example -from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops +from tensorflow.python.layers import core +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope @@ -43,32 +45,60 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.times( combinations.distributions_and_v1_optimizers(), combinations.combine(mode=["graph"], use_callable_loss=[True, False]) - + combinations.combine(mode=["eager"], use_callable_loss=[True]), - combinations.combine(is_tpu=[False])) + combinations.combine( - distribution=[combinations.tpu_strategy], - optimizer_fn=[ - combinations.adam_optimizer_v1_fn, - # TODO(isaprykin): Make Adam v2 work with while_loops - # and TPUs. - ], - mode=["graph"], - use_callable_loss=[False], - is_tpu=[True])) - def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss, - is_tpu): - # TODO(priyag): Remove this once the step TPU Strategy is stable. - if is_tpu: - self.skipTest("TPU tests are WIP.") + + combinations.combine(mode=["eager"], use_callable_loss=[True])) + + combinations.combine( + distribution=[combinations.tpu_strategy], + optimizer_fn=combinations.optimizers_v1, + mode=["graph"], + use_callable_loss=[True, False])) + def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss): + with distribution.scope(): + model_fn, dataset_fn, layer = minimize_loss_example( + optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) + + def step_fn(ctx, inputs): + del ctx # Unused + return distribution.group( + distribution.call_for_each_tower( + model_fn, inputs, run_concurrently=layer.built)) + + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() + + def run_step(): + return distribution.run_steps_on_dataset( + step_fn, iterator, iterations=2).run_op + + self.evaluate(distribution.initialize()) + if not context.executing_eagerly(): + with self.test_session() as sess: + run_step = sess.make_callable(run_step()) + self.evaluate(variables_lib.global_variables_initializer()) + + weights, biases = [], [] + for _ in range(5): + run_step() + weights.append(self.evaluate(layer.kernel)) + biases.append(self.evaluate(layer.bias)) + + self.evaluate(distribution.finalize()) + + error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) + is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) + self.assertTrue(is_not_increasing) + + @combinations.generate( + combinations.times( + combinations.distributions_and_v1_optimizers(), + combinations.combine(mode=["graph"], use_callable_loss=[True, False]) + + combinations.combine(mode=["eager"], use_callable_loss=[True]))) + def testTrainNetworkByCallForEachTower(self, distribution, optimizer_fn, + use_callable_loss): with distribution.scope(): model_fn, dataset_fn, layer = minimize_loss_example( optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) - # TODO(isaprykin): Eliminate `is_tpu`. Probably add a - # `DistributionStrategy.create_monitor` so that each DistributionStrategy - # could influence its training loop. That method would return an instance - # of Monitor. TPUMonitor would execute tpu.initialize_system() and - # tpu.shutdown_system(). iterator = distribution.distribute_dataset( dataset_fn).make_one_shot_iterator() @@ -79,8 +109,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): if not context.executing_eagerly(): with self.test_session() as sess: - if is_tpu: - sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) self.evaluate(variables_lib.global_variables_initializer()) @@ -91,10 +119,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): weights.append(self.evaluate(layer.kernel)) biases.append(self.evaluate(layer.bias)) - if is_tpu: - with self.test_session() as sess: - sess.run(tpu.shutdown_system()) - error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) self.assertTrue(is_not_increasing) @@ -103,22 +127,12 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.times( combinations.distributions_and_v1_optimizers() + combinations.distributions_and_v2_optimizers(), - combinations.combine(mode=["graph", "eager"], is_tpu=[False])) + + combinations.combine(mode=["graph", "eager"])) + combinations.combine( distribution=[combinations.tpu_strategy], - optimizer_fn=[ - combinations.adam_optimizer_v1_fn, - combinations.gradient_descent_optimizer_v1_fn, - combinations.gradient_descent_optimizer_v2_fn, - ], - mode=["graph"], - is_tpu=[True])) - - def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu): - # TODO(priyag): Remove this once the step TPU Strategy is stable. - if is_tpu: - self.skipTest("TPU tests are WIP.") - + optimizer_fn=combinations.optimizers_v1+combinations.optimizers_v2, + mode=["graph"])) + def testOptimizerInsideModelFn(self, distribution, optimizer_fn): created_variables = [] trainable_variables = [] @@ -139,26 +153,28 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): use_callable_loss=True, create_optimizer_inside_model_fn=True) + def step_fn(ctx, inputs): + del ctx # Unused + return distribution.group( + distribution.call_for_each_tower( + model_fn, inputs, run_concurrently=layer.built)) + iterator = distribution.distribute_dataset( dataset_fn).make_one_shot_iterator() def run_step(): - return distribution.group( - distribution.call_for_each_tower( - model_fn, iterator.get_next(), run_concurrently=layer.built)) + return distribution.run_steps_on_dataset( + step_fn, iterator, iterations=1).run_op + self.evaluate(distribution.initialize()) if not context.executing_eagerly(): with self.test_session() as sess: - if is_tpu: - sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) - self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(variables_lib.global_variables_initializer()) run_step() - if is_tpu: - with self.test_session() as sess: - sess.run(tpu.shutdown_system()) + self.evaluate(distribution.finalize()) def get_expected_variables(optimizer_fn, num_parameter_devices): variables_map = { @@ -189,27 +205,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.distributions_and_v1_optimizers(), combinations.combine( mode=["graph", "eager"], - is_tpu=[False], # TODO(isaprykin): Allow False here. Currently subsequent # towers will re-execute UPDATE_OPS of previous towers. update_ops_in_cross_tower_mode=[True])) + combinations.combine( distribution=[combinations.tpu_strategy], - optimizer_fn=[ - combinations.gradient_descent_optimizer_v1_fn, - combinations.gradient_descent_optimizer_v2_fn - ], + optimizer_fn=combinations.optimizers_v1, mode=["graph"], - is_tpu=[True], update_ops_in_cross_tower_mode=[False]))) def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum, - renorm, is_tpu, - update_ops_in_cross_tower_mode): + renorm, update_ops_in_cross_tower_mode): """Verifies that moving mean updates are reduced across towers.""" - # TODO(priyag): Remove this once the step TPU Strategy is stable. - if is_tpu: - self.skipTest("TPU tests are WIP.") - with distribution.scope(): num_towers = len(distribution.worker_devices) model_fn, dataset_fn, batchnorm = batchnorm_example( @@ -224,24 +230,28 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): # this test relies on specific input being on each device. if isinstance(distribution, mirrored_strategy.MirroredStrategy): self.assertFalse(distribution._prefetch_on_device) - iterator = distribution.distribute_dataset( - dataset_fn).make_one_shot_iterator() - def run_step(): + def step_fn(ctx, inputs): + del ctx # Unused fetches = distribution.unwrap( distribution.call_for_each_tower( - model_fn, iterator.get_next(), - run_concurrently=batchnorm.built)) + model_fn, inputs, run_concurrently=batchnorm.built)) if update_ops_in_cross_tower_mode: fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS) return control_flow_ops.group(fetches) + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() + + def run_step(): + return distribution.run_steps_on_dataset( + step_fn, iterator, iterations=1).run_op + + self.evaluate(distribution.initialize()) if not context.executing_eagerly(): with self.test_session() as sess: - if is_tpu: - sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) - self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(variables_lib.global_variables_initializer()) expected_moving_means = [0.] * 8 @@ -263,9 +273,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum)) self.assertNear(expected_moving_means[i], moving_means[i], 0.0001) - if is_tpu: - with self.test_session() as sess: - sess.run(tpu.shutdown_system()) + self.evaluate(distribution.finalize()) @combinations.generate( combinations.times( @@ -285,22 +293,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, combinations.mirrored_strategy_with_two_gpus - ], - is_tpu=[False]), + ]), combinations.combine( mode=["graph"], use_callable_loss=[True, False]) + combinations.combine(mode=["eager"], use_callable_loss=[True])) + combinations.combine( distribution=[combinations.tpu_strategy], - is_tpu=[True], mode=["graph"], use_callable_loss=[True, False]))) def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction, - use_callable_loss, is_tpu): - # TODO(priyag): Remove this once the step TPU Strategy is stable. - if is_tpu: - self.skipTest("TPU tests are WIP.") - + use_callable_loss): with distribution.scope(): all_vars = [] @@ -326,20 +328,25 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): labels = dataset_ops.Dataset.from_tensors([[6.], [21.]]) return dataset_ops.Dataset.zip((features, labels)).repeat() + def step_fn(ctx, inputs): + del ctx # Unused + x, y = inputs + return distribution.group( + distribution.call_for_each_tower( + model_fn, x, y, run_concurrently=False)) + iterator = distribution.distribute_dataset( dataset_fn).make_one_shot_iterator() def run_step(): - return distribution.group( - distribution.call_for_each_tower( - model_fn, *iterator.get_next(), run_concurrently=False)) + return distribution.run_steps_on_dataset( + step_fn, iterator, iterations=1).run_op + self.evaluate(distribution.initialize()) if not context.executing_eagerly(): with self.test_session() as sess: - if is_tpu: - sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) - self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(variables_lib.global_variables_initializer()) run_step() @@ -369,10 +376,132 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): # One of the mean loss reductions. self.assertNear(weight, 2 + 10.6, 0.0001) - if is_tpu: + self.evaluate(distribution.finalize()) + + @combinations.generate( + combinations.times( + combinations.distributions_and_v1_optimizers(), + combinations.combine(mode=["graph", "eager"]), + combinations.combine(is_tpu=[False])) + + combinations.combine( + distribution=[combinations.tpu_strategy], + optimizer_fn=combinations.optimizers_v1, + mode=["graph"], + is_tpu=[True])) + def testRunStepsWithOutputContext(self, distribution, optimizer_fn, is_tpu): + with distribution.scope(): + def dataset_fn(): + dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat() + # TODO(priyag): batch with drop_remainder=True causes shapes to be + # fully defined for TPU. Remove this when XLA supports dynamic shapes. + return dataset.batch(batch_size=1, drop_remainder=True) + + optimizer = optimizer_fn() + layer = core.Dense(1, use_bias=True) + + key1 = "foo" + value1 = "bar" + + def model_fn(output_context, x): + """A very simple model written by the user.""" + def loss_fn(): + y = array_ops.reshape(layer(x), []) - constant_op.constant(1.) + return y * y + + train_op = optimizer.minimize(loss_fn) + loss = loss_fn() + output_context.set_last_step_output( + name="tower_loss_agg", + output=loss, + aggregation=variables_lib.VariableAggregation.MEAN) + output_context.set_non_tensor_output(key1, value1) + return (train_op, loss) + + def step_fn(output_context, inputs): + (train_op, loss) = distribution.call_for_each_tower( + model_fn, output_context, inputs, run_concurrently=False) + output_context.set_last_step_output( + name="cross_tower_loss_agg", + output=loss, + aggregation=variables_lib.VariableAggregation.MEAN) + output_context.set_last_step_output( + name="cross_tower_loss_noagg", + output=loss) + return distribution.group(train_op) + + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() + + def run_step(): + initial_loss = lambda: constant_op.constant(1e7) + # Initial values corresponding to aggregated losses are just single + # tensors. But for non aggregated losses, we need to have initial + # values that are of the same structure as non reduced losses. In + # MirroredStrategy, this will be a list of losses, in TPUStrategy + # it will be single tensor. Using `broadcast` followed by `unwrap` + # gives us the desired initial value structure. + initial_loop_values = { + "tower_loss_agg": initial_loss(), + "cross_tower_loss_agg": initial_loss(), + "cross_tower_loss_noagg": + distribution.unwrap(distribution.broadcast(initial_loss())) + } + ctx = distribution.run_steps_on_dataset( + step_fn, iterator, iterations=2, + initial_loop_values=initial_loop_values) + + self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs) + self._verify_loss_output( + initial_loss(), + loss_output=ctx.last_step_outputs["tower_loss_agg"], + aggregated=True, distribution=distribution) + self._verify_loss_output( + initial_loss(), + loss_output=ctx.last_step_outputs["cross_tower_loss_agg"], + aggregated=True, distribution=distribution) + self._verify_loss_output( + initial_loss(), + loss_output=ctx.last_step_outputs["cross_tower_loss_noagg"], + aggregated=False, distribution=distribution) + return (ctx.run_op, ctx.last_step_outputs["tower_loss_agg"]) + + self.evaluate(distribution.initialize()) + if not context.executing_eagerly(): with self.test_session() as sess: - sess.run(tpu.shutdown_system()) + run_step = sess.make_callable(run_step()) + self.evaluate(variables_lib.global_variables_initializer()) + + weights, biases, losses = [], [], [] + for _ in range(5): + _, loss = run_step() + losses.append(loss) + weights.append(self.evaluate(layer.kernel)) + biases.append(self.evaluate(layer.bias)) + self.evaluate(distribution.finalize()) + + loss_is_not_increasing = all(y <= x for x, y in zip(losses, losses[1:])) + self.assertTrue(loss_is_not_increasing) + + error = abs( + numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) + error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) + self.assertTrue(error_is_not_increasing) + + def _verify_loss_output(self, initial_loss, loss_output, aggregated, + distribution): + if not aggregated: + self.assertEqual(distribution.num_towers, + len(distribution.unwrap(loss_output))) + loss_output = distribution.reduce( + aggregation=variables_lib.VariableAggregation.MEAN, + value=loss_output, destinations="/device:CPU:0") + + unwrapped_output = distribution.unwrap(loss_output) + self.assertEqual(1, len(unwrapped_output)) + loss_tensor = unwrapped_output[0] + self.assertEqual(initial_loss.dtype, loss_tensor.dtype) + self.assertEqual(initial_loss.shape, loss_tensor.shape) if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py index eb2d102012217026f6edb2256ae05b5ce4e4301e..e3376a06368e8ef5efcda5bb69de66b7ec3390e1 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py @@ -27,13 +27,17 @@ from tensorflow.contrib.distribute.python import values from tensorflow.python import pywrap_tensorflow from tensorflow.python.eager import context from tensorflow.python.eager import tape +from tensorflow.python.framework import constant_op from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training import coordinator from tensorflow.python.training import device_util from tensorflow.python.training import distribute as distribute_lib +from tensorflow.python.util import nest # TODO(josh11b): Replace asserts in this file with if ...: raise ... @@ -186,12 +190,20 @@ def _reduce_non_distributed_value(distribution, aggregation, value, raise ValueError("You are passing a `DistributedValue` to " "`_reduce_non_distributed_value`, which is not allowed.") + # If the same value is present on all towers then the PerDevice value will + # be a single value. We also handle the case when `value` is a single value + # and equal to 0. if value == 0: return 0 + # If the aggregation type is MEAN, then this essentially means that the same + # value should be on all destinations. if aggregation == variable_scope.VariableAggregation.MEAN: return distribution.broadcast(value, destinations) cross_tower_ops_lib.validate_destinations(destinations) + # We do not support an aggregation type of SUM if the value is the same across + # all towers. We call this as part of assign functions for MirroredVariables + # and summing up identical values across towers is not clearly defined. if (len(distribution.worker_devices) != 1 or not cross_tower_ops_lib.check_destinations(destinations)): raise ValueError("A non-DistributedValues value cannot be reduced with the " @@ -209,6 +221,75 @@ def _reduce_non_distributed_value(distribution, aggregation, value, return values.Mirrored(value_updates) +def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs): # pylint: disable=g-missing-docstring + # Figure out what collections this variable should be added to. + # We'll add the MirroredVariable to those collections instead. + collections = kwargs.pop("collections", None) + if collections is None: + collections = [ops.GraphKeys.GLOBAL_VARIABLES] + kwargs["collections"] = [] + + # Get synchronization value + synchronization = kwargs.get("synchronization", + variable_scope.VariableSynchronization.ON_WRITE) + if synchronization == variable_scope.VariableSynchronization.NONE: + raise ValueError("`NONE` variable synchronization mode is not " + "supported with `Mirrored` distribution strategy. Please" + " change the `synchronization` for variable: " + + kwargs["name"]) + elif synchronization == variable_scope.VariableSynchronization.ON_READ: + # Variables that are to be synced on read are tower local. + is_tower_local = True + kwargs["trainable"] = False + elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or + synchronization == variable_scope.VariableSynchronization.AUTO): + # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`. + is_tower_local = False + else: + raise ValueError("Invalid variable synchronization mode: " + + synchronization + " for variable: " + kwargs["name"]) + + # Get aggregation value + aggregation = kwargs.pop("aggregation", + variable_scope.VariableAggregation.NONE) + if aggregation not in [ + variable_scope.VariableAggregation.NONE, + variable_scope.VariableAggregation.SUM, + variable_scope.VariableAggregation.MEAN + ]: + raise ValueError("Invalid variable aggregation mode: " + aggregation + + " for variable: " + kwargs["name"]) + + # Ignore user-specified caching device, not needed for mirrored variables. + kwargs.pop("caching_device", None) + + # TODO(josh11b,apassos): It would be better if variable initialization + # was never recorded on the tape instead of having to do this manually + # here. + with tape.stop_recording(): + index = real_mirrored_creator(devices, *args, **kwargs) + + if is_tower_local: + result = values.TowerLocalVariable(index, index[devices[0]], aggregation) + else: + result = values.MirroredVariable(index, index[devices[0]], aggregation) + + if not context.executing_eagerly(): + g = ops.get_default_graph() + # If "trainable" is True, next_creator() will add the member variables + # to the TRAINABLE_VARIABLES collection, so we manually remove + # them and replace with the MirroredVariable. We can't set + # "trainable" to False for next_creator() since that causes functions + # like implicit_gradients to skip those variables. + if kwargs.get("trainable", True): + collections.append(ops.GraphKeys.TRAINABLE_VARIABLES) + l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES) + for v in index.values(): + l.remove(v) + g.add_to_collections(collections, result) + return result + + class MirroredStrategy(distribute_lib.DistributionStrategy): """Mirrors vars to distribute across multiple devices on a single machine. @@ -236,61 +317,17 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): self._devices = [device_util.resolve(d) for d in devices] self._canonical_device_set = set(self._devices) self._device_index = values.PerDevice( - dict((d, i) for i, d in enumerate(devices))) + {d: i for i, d in enumerate(devices)}) self._cross_tower_ops = cross_tower_ops self._prefetch_on_device = prefetch_on_device # TODO(yuefengz): consider setting the default device. def _create_variable(self, next_creator, *args, **kwargs): """Create a mirrored variable. See `DistributionStrategy.scope`.""" - # Figure out what collections this variable should be added to. - # We'll add the MirroredVariable to those collections instead. - collections = kwargs.pop("collections", None) - if collections is None: - collections = [ops.GraphKeys.GLOBAL_VARIABLES] - kwargs["collections"] = [] - colocate_with = kwargs.pop("colocate_with", None) devices = self._get_devices_from(colocate_with) - # Get synchronization value - synchronization = kwargs.get( - "synchronization", variable_scope.VariableSynchronization.ON_WRITE) - if synchronization == variable_scope.VariableSynchronization.NONE: - raise ValueError("`NONE` variable synchronization mode is not " - "supported with `Mirrored` distribution strategy. Please" - " change the `synchronization` for variable: " + - kwargs["name"]) - elif synchronization == variable_scope.VariableSynchronization.ON_READ: - # Variables that are to be synced on read are tower local. - is_tower_local = True - kwargs["trainable"] = False - elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or - synchronization == variable_scope.VariableSynchronization.AUTO): - # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`. - is_tower_local = False - else: - raise ValueError("Invalid variable synchronization mode: " + - synchronization + " for variable: " + kwargs["name"]) - - # Get aggregation value - aggregation = kwargs.pop("aggregation", - variable_scope.VariableAggregation.NONE) - if aggregation not in [ - variable_scope.VariableAggregation.NONE, - variable_scope.VariableAggregation.SUM, - variable_scope.VariableAggregation.MEAN - ]: - raise ValueError("Invalid variable aggregation mode: " + aggregation + - " for variable: " + kwargs["name"]) - - # Ignore user-specified caching device, not needed for mirrored variables. - kwargs.pop("caching_device", None) - - # TODO(josh11b,apassos): It would be better if variable initialization - # was never recorded on the tape instead of having to do this manually - # here. - with tape.stop_recording(): + def _real_mirrored_creator(devices, *args, **kwargs): # pylint: disable=g-missing-docstring index = {} for i, d in enumerate(devices): with ops.device(d): @@ -314,33 +351,64 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): v = next_creator(*args, **kwargs) assert not isinstance(v, values.DistributedVariable) index[d] = v + return index - if is_tower_local: - result = values.TowerLocalVariable(index, index[devices[0]], - aggregation) - else: - result = values.MirroredVariable(index, index[devices[0]], aggregation) - - if not context.executing_eagerly(): - g = ops.get_default_graph() - # If "trainable" is True, next_creator() will add the member variables - # to the TRAINABLE_VARIABLES collection, so we manually remove - # them and replace with the MirroredVariable. We can't set - # "trainable" to False for next_creator() since that causes functions - # like implicit_gradients to skip those variables. - if kwargs.get("trainable", True): - collections.append(ops.GraphKeys.TRAINABLE_VARIABLES) - l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES) - for v in index.values(): - l.remove(v) - g.add_to_collections(collections, result) - return result + return _create_mirrored_variable(devices, _real_mirrored_creator, *args, + **kwargs) def distribute_dataset(self, dataset_fn): return values.PerDeviceDataset( self._call_dataset_fn(dataset_fn), self._devices, self._prefetch_on_device) + # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. + def _run_steps_on_dataset(self, fn, iterator, iterations, + initial_loop_values=None): + if initial_loop_values is None: + initial_loop_values = {} + initial_loop_values = nest.flatten(initial_loop_values) + + ctx = values.MultiStepContext() + def body(i, *args): + """A wrapper around `fn` to create the while loop body.""" + del args + fn_result = fn(ctx, iterator.get_next()) + for (name, output) in ctx.last_step_outputs.items(): + # Convert all outputs to tensors, potentially from `DistributedValues`. + ctx.last_step_outputs[name] = self.unwrap(output) + flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) + with ops.control_dependencies([fn_result]): + return [i + 1] + flat_last_step_outputs + + cond = lambda i, *args: i < iterations + i = constant_op.constant(0) + loop_result = control_flow_ops.while_loop( + cond, body, [i] + initial_loop_values, name="", + parallel_iterations=1, back_prop=False, swap_memory=False, + return_same_structure=True) + + ctx.run_op = control_flow_ops.group(loop_result) + + # Convert the last_step_outputs from a list to the original dict structure + # of last_step_outputs. + last_step_tensor_outputs = loop_result[1:] + last_step_tensor_outputs_dict = nest.pack_sequence_as( + ctx.last_step_outputs, last_step_tensor_outputs) + + for (name, aggregation) in ctx._last_step_outputs_aggregations.items(): # pylint: disable=protected-access + output = last_step_tensor_outputs_dict[name] + # For outputs that have already been aggregated, wrap them in a Mirrored + # container, else in a PerDevice container. + if aggregation is variables_lib.VariableAggregation.NONE: + last_step_tensor_outputs_dict[name] = values.regroup( + {d: t for d, t in zip(self._devices, output)}, values.PerDevice) + else: + assert len(output) == 1 + last_step_tensor_outputs_dict[name] = output[0] + + ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access + return ctx + def _broadcast(self, tensor, destinations): # TODO(josh11b): In eager mode, use one thread per device, or async mode. return self._get_cross_tower_ops().broadcast(tensor, destinations or @@ -378,6 +446,9 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): def _reduce(self, aggregation, value, destinations): assert not isinstance(value, values.Mirrored) if not isinstance(value, values.DistributedValues): + # This function handles reducing values that are not PerDevice or Mirrored + # values. For example, the same value could be present on all towers in + # which case `value` would be a single value or value could be 0. return _reduce_non_distributed_value(self, aggregation, value, destinations) return self._get_cross_tower_ops().reduce( @@ -426,6 +497,9 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): return [val.get(device=d) for d in sorted(val.devices)] return [val] + def value_container(self, val): + return values.value_container(val) + @property def is_single_tower(self): return len(self._devices) == 1 diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index aab7119901023affaad954c4c4ca7678a2ffee06..e064cfe37db40a51e18a16c532500415a8b74816 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -25,7 +25,9 @@ from tensorflow.contrib.distribute.python import strategy_test_lib from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.eager import function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -37,6 +39,7 @@ from tensorflow.python.ops import rnn from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables +from tensorflow.python.training import device_util from tensorflow.python.training import distribute as distribute_lib @@ -838,6 +841,29 @@ class MirroredVariableUpdateTest(test.TestCase): model_fn, run_concurrently=False))) self.assertEquals(0.5, self.evaluate(mirrored_var)) + @test_util.run_in_graph_and_eager_modes(config=config) + def testAssignMirroredVarTowerContextWithSingleValue(self): + self._skip_eager_if_gpus_less_than(1) + def var_fn(): + return variable_scope.variable( + 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) + + dist = mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:CPU:0"]) + + with dist.scope(): + mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False) + self.assertIsInstance(mirrored_var, values.MirroredVariable) + self.evaluate(variables.global_variables_initializer()) + self.assertEquals(1.0, self.evaluate(mirrored_var)) + + def model_fn(): + return mirrored_var.assign(5.0) + + self.evaluate(dist.unwrap(dist.call_for_each_tower( + model_fn, run_concurrently=False))) + self.assertEquals(5.0, self.evaluate(mirrored_var)) + @test_util.run_in_graph_and_eager_modes(config=config) def testAssignAddMirroredVarCrossTowerContext(self): self._skip_eager_if_gpus_less_than(1) @@ -880,6 +906,29 @@ class MirroredVariableUpdateTest(test.TestCase): model_fn, run_concurrently=False))) self.assertEquals(1.5, self.evaluate(mirrored_var)) + @test_util.run_in_graph_and_eager_modes(config=config) + def testAssignAddMirroredVarTowerContextWithSingleValue(self): + self._skip_eager_if_gpus_less_than(1) + def var_fn(): + return variable_scope.variable( + 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) + + dist = mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:CPU:0"]) + + with dist.scope(): + mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False) + self.assertIsInstance(mirrored_var, values.MirroredVariable) + self.evaluate(variables.global_variables_initializer()) + self.assertEquals(1.0, self.evaluate(mirrored_var)) + + def model_fn(): + return mirrored_var.assign_add(5.0) + + self.evaluate(dist.unwrap(dist.call_for_each_tower( + model_fn, run_concurrently=False))) + self.assertEquals(6.0, self.evaluate(mirrored_var)) + @test_util.run_in_graph_and_eager_modes(config=config) def testAssignSubMirroredVarCrossTowerContext(self): self._skip_eager_if_gpus_less_than(1) @@ -922,6 +971,29 @@ class MirroredVariableUpdateTest(test.TestCase): model_fn, run_concurrently=False))) self.assertEquals(4.5, self.evaluate(mirrored_var)) + @test_util.run_in_graph_and_eager_modes(config=config) + def testAssignSubMirroredVarTowerContextWithSingleValue(self): + self._skip_eager_if_gpus_less_than(1) + def var_fn(): + return variable_scope.variable( + 5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) + + dist = mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:CPU:0"]) + + with dist.scope(): + mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False) + self.assertIsInstance(mirrored_var, values.MirroredVariable) + self.evaluate(variables.global_variables_initializer()) + self.assertEquals(5.0, self.evaluate(mirrored_var)) + + def model_fn(): + return mirrored_var.assign_sub(1.0) + + self.evaluate(dist.unwrap(dist.call_for_each_tower( + model_fn, run_concurrently=False))) + self.assertEquals(4.0, self.evaluate(mirrored_var)) + class MirroredAndTowerLocalVariableInitializerTest(test.TestCase): config = config_pb2.ConfigProto() @@ -974,7 +1046,7 @@ class TowerLocalVariableAssignTest(test.TestCase): def _skip_eager_if_gpus_less_than(self, num_gpus): if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Enough GPUs not available for this test in eager mode.") + self.skipTest("Not enough GPUs available for this test in eager mode.") @test_util.run_in_graph_and_eager_modes(config=config) def testAssignTowerLocalVarSumAggregation(self): @@ -1036,5 +1108,131 @@ class TowerLocalVariableAssignTest(test.TestCase): self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var))) +class MockModel(object): + + def __init__(self, two_variables=False): + self.variables = [] + self.variables.append(variable_scope.variable(1.25, name="dummy_var1")) + if two_variables: + self.variables.append(variable_scope.variable(2.0, name="dummy_var2")) + + def __call__(self, factor=2): + x = factor * self.variables[0] + if len(self.variables) > 1: + x += self.variables[1] + return x + + +class MirroredStrategyDefunTest(test.TestCase): + + def _skip_eager_if_gpus_less_than(self, num_gpus): + if context.num_gpus() < num_gpus and context.executing_eagerly(): + self.skipTest("Not enough GPUs available for this test in eager mode.") + + def _call_and_check(self, model_fn, inputs, expected_result, defuns, + two_variables=False): + cpu_dev = device_util.canonicalize("CPU:0") + gpu_dev = device_util.canonicalize("GPU:0") + devices = [cpu_dev, gpu_dev] + dist = mirrored_strategy.MirroredStrategy(devices) + + with dist.scope(): + mock_model = MockModel(two_variables) + self.evaluate(variables.global_variables_initializer()) + + result = dist.call_for_each_tower(model_fn, mock_model, *inputs, + run_concurrently=False) + for device in devices: + device_result = values.select_device(device, result) + device_expected_result = values.select_device(device, expected_result) + self.assertAllClose(device_expected_result, + self.evaluate(device_result)) + + for defun in defuns: + self.assertEqual(set(mock_model.variables), set(defun.variables)) + + @test_util.run_in_graph_and_eager_modes() + def testVariableInDefun(self): + self._skip_eager_if_gpus_less_than(1) + + @function.defun + def times_two(mock_model): + return mock_model() + + def model_fn(mock_model): + return times_two(mock_model) + + self._call_and_check(model_fn, [], 2.5, [times_two]) + + @test_util.run_in_graph_and_eager_modes() + def testVariableInNestedDefun(self): + self._skip_eager_if_gpus_less_than(1) + + @function.defun + def times_two(mock_model): + return mock_model() + + @function.defun + def two_x_plus_one(mock_model): + return times_two(mock_model) + 1 + + def model_fn(mock_model): + return two_x_plus_one(mock_model) + + self._call_and_check(model_fn, [], 3.5, [times_two, two_x_plus_one]) + + @test_util.run_in_graph_and_eager_modes() + def testTwoVariablesInNestedDefun(self): + self._skip_eager_if_gpus_less_than(1) + + @function.defun + def fn1(mock_model): + return mock_model() + + @function.defun + def fn2(mock_model): + return fn1(mock_model) + 1 + + def model_fn(mock_model): + return fn2(mock_model) + + self._call_and_check(model_fn, [], 5.5, [fn1, fn2], two_variables=True) + + @test_util.run_in_graph_and_eager_modes() + def testGradientTapeOverNestedDefuns(self): + self._skip_eager_if_gpus_less_than(1) + + @function.defun + def fn1(mock_model): + return mock_model() + + @function.defun + def fn2(mock_model): + return fn1(mock_model) + 1 + + def model_fn(mock_model): + with backprop.GradientTape(persistent=True) as gtape: + result = fn2(mock_model) + grads = gtape.gradient(result, + [v.get() for v in mock_model.variables]) + return grads + + self._call_and_check(model_fn, [], [2.0, 1.0], [fn1, fn2], + two_variables=True) + + @test_util.run_in_graph_and_eager_modes() + def testPassPerDevice(self): + self._skip_eager_if_gpus_less_than(1) + + @function.defun + def fn1(mock_model, factor): + return mock_model(factor) + + factors = values.PerDevice({"CPU:0": 5.0, "GPU:0": 3.0}) + expected_result = values.PerDevice({"CPU:0": 5.0 * 1.25, + "GPU:0": 3.0 * 1.25}) + self._call_and_check(fn1, [factors], expected_result, [fn1]) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py index fa479918bd48224d042725566ec905018b974f45..249de01f0880b02d603687db99692088480f7136 100644 --- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py +++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py @@ -20,11 +20,14 @@ from __future__ import print_function import contextlib import copy +import threading +import numpy as np from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session -from tensorflow.python.eager import test +from tensorflow.python.estimator import run_config +from tensorflow.python.platform import test from tensorflow.python.framework import test_util @@ -35,6 +38,12 @@ def create_in_process_cluster(num_workers, num_ps): worker_config = config_pb2.ConfigProto() worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac + # Enable collective ops which has no impact on non-collective ops. + # TODO(yuefengz, tucker): removing this after we move the initialization of + # collective mgr to the session level. + worker_config.experimental.collective_group_leader = ( + '/job:worker/replica:0/task:0') + ps_config = config_pb2.ConfigProto() ps_config.device_count['GPU'] = 0 @@ -43,7 +52,7 @@ def create_in_process_cluster(num_workers, num_ps): # We could've started the server in another process, we could then kill that # process to terminate the server. The reasons why we don't want multiple # processes are - # 1) it is more difficult to manage these processes + # 1) it is more difficult to manage these processes; # 2) there is something global in CUDA such that if we initialize CUDA in the # parent process, the child process cannot initialize it again and thus cannot # use GPUs (https://stackoverflow.com/questions/22950047). @@ -51,7 +60,8 @@ def create_in_process_cluster(num_workers, num_ps): num_workers, num_ps=num_ps, worker_config=worker_config, - ps_config=ps_config) + ps_config=ps_config, + protocol='grpc') class MultiWorkerTestBase(test.TestCase): @@ -60,11 +70,18 @@ class MultiWorkerTestBase(test.TestCase): @classmethod def setUpClass(cls): """Create a local cluster with 2 workers.""" - workers, _ = create_in_process_cluster(num_workers=2, num_ps=0) - cls._master_target = workers[0].target + cls._workers, cls._ps = create_in_process_cluster(num_workers=2, num_ps=0) + + def setUp(self): + # We only cache the session in one test because another test may have a + # different session config or master target. + self._thread_local = threading.local() + self._thread_local.cached_session = None + self._result = 0 + self._lock = threading.Lock() @contextlib.contextmanager - def test_session(self, graph=None, config=None): + def test_session(self, graph=None, config=None, target=None): """Create a test session with master target set to the testing cluster. This overrides the base class' method, removes arguments that are not needed @@ -75,6 +92,7 @@ class MultiWorkerTestBase(test.TestCase): graph: Optional graph to use during the returned session. config: An optional config_pb2.ConfigProto to use to configure the session. + target: the target of session to connect to. Yields: A Session object that should be used as a context manager to surround @@ -94,13 +112,46 @@ class MultiWorkerTestBase(test.TestCase): rewriter_config_pb2.RewriterConfig.OFF) if graph is None: - if self._cached_session is None: # pylint: disable=access-member-before-definition - self._cached_session = session.Session( - graph=None, config=config, target=self._master_target) - sess = self._cached_session + if getattr(self._thread_local, 'cached_session', None) is None: + self._thread_local.cached_session = session.Session( + graph=None, config=config, target=target or self._workers[0].target) + sess = self._thread_local.cached_session with sess.graph.as_default(), sess.as_default(): yield sess else: with session.Session( - graph=graph, config=config, target=self._master_target) as sess: + graph=graph, config=config, target=target or + self._workers[0].target) as sess: yield sess + + def _run_client(self, client_fn, task_type, task_id, num_gpus, *args, + **kwargs): + result = client_fn(task_type, task_id, num_gpus, *args, **kwargs) + if np.all(result): + with self._lock: + self._result += 1 + + def _run_between_graph_clients(self, client_fn, cluster_spec, num_gpus, *args, + **kwargs): + """Runs several clients for between-graph replication. + + Args: + client_fn: a function that needs to accept `task_type`, `task_id`, + `num_gpus` and returns True if it succeeds. + cluster_spec: a dict specifying jobs in a cluster. + num_gpus: number of GPUs per worker. + *args: will be passed to `client_fn`. + **kwargs: will be passed to `client_fn`. + """ + threads = [] + for task_type in [run_config.TaskType.CHIEF, run_config.TaskType.WORKER]: + for task_id in range(len(cluster_spec.get(task_type, []))): + t = threading.Thread( + target=self._run_client, + args=(client_fn, task_type, task_id, num_gpus) + args, + kwargs=kwargs) + t.start() + threads.append(t) + for t in threads: + t.join() + self.assertEqual(self._result, len(threads)) diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py index dbd3514aec7d40d9a04dba4bcbc5c14be639aa33..016978cdb3a152bbba0a2e63df1dea4035e32789 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy.py @@ -21,11 +21,14 @@ from __future__ import print_function import six from tensorflow.contrib.distribute.python import values +from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.training import distribute as distribute_lib +from tensorflow.python.util import nest # TODO(josh11b): Replace asserts in this file with if ...: raise ... @@ -66,6 +69,41 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): def _broadcast(self, tensor, destinations): return tensor + # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. + def _run_steps_on_dataset(self, fn, iterator, iterations, + initial_loop_values=None): + if initial_loop_values is None: + initial_loop_values = {} + initial_loop_values = nest.flatten(initial_loop_values) + + ctx = values.MultiStepContext() + def body(i, *args): + """A wrapper around `fn` to create the while loop body.""" + del args + fn_result = fn(ctx, iterator.get_next()) + flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) + with ops.control_dependencies([fn_result]): + return [i + 1] + flat_last_step_outputs + + cond = lambda i, *args: i < iterations + i = constant_op.constant(0) + # TODO(priyag): Use max_iterations instead of an explicit counter. + loop_result = control_flow_ops.while_loop( + cond, body, [i] + initial_loop_values, name="", + parallel_iterations=1, back_prop=False, swap_memory=False, + return_same_structure=True) + + ctx.run_op = control_flow_ops.group(loop_result) + + # Convert the last_step_outputs from a list to the original dict structure + # of last_step_outputs. + last_step_tensor_outputs = loop_result[1:] + last_step_tensor_outputs_dict = nest.pack_sequence_as( + ctx.last_step_outputs, last_step_tensor_outputs) + + ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access + return ctx + def _call_for_each_tower(self, fn, *args, **kwargs): # We don't run `fn` in multiple threads in OneDeviceStrategy. kwargs.pop("run_concurrently", None) @@ -105,6 +143,9 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): def _unwrap(self, value): return [value] + def value_container(self, value): + return value + @property def is_single_tower(self): return True diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index 9bcf6f8bac1d0d694381a12e7609a87e8025fa63..407c78df95ded5ef6f3ad973392a4d4a21d07735 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -77,16 +77,16 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra caution needs to be taken: - 1) Always use @{tf.get_variable} instead of @{tf.Variable} which is not able + 1) Always use `tf.get_variable` instead of `tf.Variable` which is not able to refer to the same variable on different towers. 2) It is generally not recommended to open a device scope under the strategy's - scope. A device scope (i.e. calling @{tf.device}) will be merged with or + scope. A device scope (i.e. calling `tf.device`) will be merged with or override the device for operations but will not change the device for variables. 3) It is also not recommended to open a colocation scope (i.e. calling - @{tf.colocate_with}) under the strategy's scope. For colocating variables, + `tf.colocate_with`) under the strategy's scope. For colocating variables, use `distribution.colocate_vars_with` instead. Colocation of ops will possibly create conflicts of device assignement. """ @@ -312,6 +312,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy): return [val.get(device=d) for d in sorted(val.devices)] return [val] + def value_container(self, val): + return values.value_container(val) + def read_var(self, var): # No need to distinguish between normal variables and tower-local variables. return array_ops.identity(var) diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index ad538b9e8ee99d3658ef3dbfad9fbe66bcfd2b6d..cf29c0ed91a14843ce15bf671dd363ca0f7073c0 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib import json import threading from absl.testing import parameterized @@ -26,8 +25,6 @@ from absl.testing import parameterized from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import parameter_server_strategy -from tensorflow.core.protobuf import config_pb2 -from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.estimator import run_config from tensorflow.python.framework import constant_op @@ -43,12 +40,19 @@ from tensorflow.python.training import device_util from tensorflow.python.training import distribute as distribute_lib -class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): +class ParameterServerStrategyTest(multi_worker_test_base.MultiWorkerTestBase, + parameterized.TestCase): @classmethod def setUpClass(cls): cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=2) + cls._cluster_spec = { + run_config.TaskType.WORKER: [ + 'fake_worker_0', 'fake_worker_1', 'fake_worker_2' + ], + run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] + } def setUp(self): self._result = 0 @@ -57,40 +61,34 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): self._init_reached = 0 self._finish_condition = threading.Condition() self._finish_reached = 0 + super(ParameterServerStrategyTest, self).setUp() + + def _get_test_objects(self, task_type, task_id, num_gpus): + distribution = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=num_gpus) + if not task_type: + return distribution, '' - def _get_ps_distribution_strategy(self, task_type, task_index, num_gpus=0): tf_config = { - 'cluster': { - run_config.TaskType.WORKER: [ - 'fake_worker_0', 'fake_worker_1', 'fake_worker_2' - ], - run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] - }, + 'cluster': self._cluster_spec, 'task': { 'type': task_type, - 'index': task_index + 'index': task_id } } - distribution = parameter_server_strategy.ParameterServerStrategy( - num_gpus_per_worker=num_gpus) with self._lock: # Accessing environment variables should be protected by locks because # environment variables are shared by all threads. with test.mock.patch.dict('os.environ', {'TF_CONFIG': json.dumps(tf_config)}): distribution.configure() - return distribution - - @contextlib.contextmanager - def _test_session(self, target): - config = config_pb2.ConfigProto(allow_soft_placement=True) - config.graph_options.optimizer_options.opt_level = -1 - with session.Session(graph=None, config=config, target=target) as sess: - yield sess + return distribution, self._workers[task_id].target - def _test_device_assignment_distributed(self, d, num_gpus=0): + def _test_device_assignment_distributed(self, task_type, task_id, num_gpus): + worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id) + d, _ = self._get_test_objects(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ - self._test_session(target=self._workers[0].target) as sess, \ + self.test_session(target=self._workers[0].target) as sess, \ d.scope(): # Define a variable outside the call_for_each_tower scope. This is not @@ -108,12 +106,9 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): a = constant_op.constant(1.0) b = constant_op.constant(2.0) c = a + b - self.assertEqual(a.device, - '/job:worker/replica:0/task:1/%s' % last_part_device) - self.assertEqual(b.device, - '/job:worker/replica:0/task:1/%s' % last_part_device) - self.assertEqual(c.device, - '/job:worker/replica:0/task:1/%s' % last_part_device) + self.assertEqual(a.device, worker_device + '/' + last_part_device) + self.assertEqual(b.device, worker_device + '/' + last_part_device) + self.assertEqual(c.device, worker_device + '/' + last_part_device) # The device scope is ignored for variables but not for normal ops. with ops.device('/job:worker/task:0'): @@ -143,13 +138,12 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): z_add = z.assign_add(y) with ops.control_dependencies([z_add]): f = z + c - self.assertEqual(f.device, - '/job:worker/replica:0/task:1/%s' % last_part_device) + self.assertEqual(f.device, worker_device + '/' + last_part_device) # The device scope would merge with the default worker device. with ops.device('/CPU:1'): g = e + 1.0 - self.assertEqual(g.device, '/job:worker/replica:0/task:1/device:CPU:1') + self.assertEqual(g.device, worker_device + '/device:CPU:1') # Ths ops.colocate_with will be ignored when defining a variale but not # for a normal tensor. @@ -182,8 +176,7 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) def testDeviceAssignmentDistributed(self, num_gpus): - d = self._get_ps_distribution_strategy('worker', 1, num_gpus=num_gpus) - self._test_device_assignment_distributed(d, num_gpus=num_gpus) + self._test_device_assignment_distributed('worker', 1, num_gpus) def _test_device_assignment_local(self, d, @@ -191,7 +184,7 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): variable_device='CPU', num_gpus=0): with ops.Graph().as_default(), \ - self._test_session(target=self._workers[0].target) as sess, \ + self.test_session(target=self._workers[0].target) as sess, \ d.scope(): def model_fn(): @@ -272,30 +265,33 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): self.assertEqual(z_val, 43.0) self.assertEqual(f_val, 46.0) - def testDeviceAssignmentLocal(self): + def testDeviceAssignmentLocalCPU(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=0) self._test_device_assignment_local( distribution, compute_device='CPU', variable_device='CPU', num_gpus=0) + def testDeviceAssignmentLocalOneGPU(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=1) self._test_device_assignment_local( distribution, compute_device='GPU', variable_device='GPU', num_gpus=1) + def testDeviceAssignmentLocalTwoGPUs(self): distribution = parameter_server_strategy.ParameterServerStrategy( num_gpus_per_worker=2) self._test_device_assignment_local( distribution, compute_device='GPU', variable_device='CPU', num_gpus=2) - def _test_simple_increment(self, d, task_type, task_index, master_target): + def _test_simple_increment(self, task_type, task_id, num_gpus): + d, master_target = self._get_test_objects(task_type, task_id, num_gpus) if hasattr(d, '_cluster_spec') and d._cluster_spec: num_workers = len(d._cluster_spec.as_dict().get('worker', ['dummy_worker'])) else: num_workers = 1 with ops.Graph().as_default(), \ - self._test_session(target=master_target) as sess, \ + self.test_session(target=master_target) as sess, \ d.scope(): def model_fn(): @@ -314,7 +310,7 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): if context.num_gpus() < d._num_gpus_per_worker: return True - if task_index == 0: + if task_id == 0: variables.global_variables_initializer().run() # Workers waiting for chief worker's initializing variables. @@ -341,9 +337,10 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): return (x_val == 10.0 + 1.0 * num_workers * d.num_towers and y_val == 20.0 + 1.0 * num_workers * d.num_towers) - def _test_minimize_loss_graph(self, d, task_type, task_index, master_target): + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): + d, master_target = self._get_test_objects(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ - self._test_session(target=master_target) as sess, \ + self.test_session(target=master_target) as sess, \ d.scope(): l = core.Dense(1, use_bias=False) @@ -390,7 +387,7 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): if context.num_gpus() < d._num_gpus_per_worker: return True - if task_index == 0: + if task_id == 0: variables.global_variables_initializer().run() # Workers waiting for chief worker's initializing variables. @@ -413,42 +410,20 @@ class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase): self.assertLess(error_after, error_before) return error_after < error_before - def _run_client(self, index, model_fn, num_gpus): - task_type = run_config.TaskType.WORKER - result = model_fn( - self._get_ps_distribution_strategy(task_type, index, num_gpus=num_gpus), - task_type, index, self._workers[index].target) - if result: - with self._lock: - self._result += 1 - - def _run_multiple_clients(self, num_clients, model_fn, num_gpus=0): - threads = [] - for i in range(num_clients): - t = threading.Thread( - target=self._run_client, args=(i, model_fn, num_gpus)) - t.start() - threads.append(t) - for t in threads: - t.join() - def testSimpleBetweenGraph(self): - self._run_multiple_clients(3, self._test_simple_increment) - self.assertEqual(self._result, 3) + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, 0) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) def testLocalSimpleIncrement(self, num_gpus): - d = parameter_server_strategy.ParameterServerStrategy( - num_gpus_per_worker=num_gpus) - self._test_simple_increment(d, 'dummy_worker', 0, '') + self._test_simple_increment(None, 0, num_gpus) @combinations.generate( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) def testMinimizeLossGraph(self, num_gpus): - self._run_multiple_clients( - 3, self._test_minimize_loss_graph, num_gpus=num_gpus) - self.assertEqual(self._result, 3) + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) if __name__ == '__main__': diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py index 24cdc627a35f4455cb92484566dc13fa1bbaf2cc..1ff60c076226299a89060a295c1cc0c50817b861 100644 --- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py +++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py @@ -35,7 +35,7 @@ from tensorflow.python.util import nest # pylint: disable=protected-access class _PrefetchToDeviceIterator(object): - """A replacement for @{tf.data.Iterator} that prefetches to another device. + """A replacement for `tf.data.Iterator` that prefetches to another device. Args: input_dataset: The input dataset. @@ -108,7 +108,7 @@ class _PrefetchToDeviceIterator(object): self._input_dataset) def get_next(self, name=None): - """See @{tf.data.Iterator.get_next}.""" + """See `tf.data.Iterator.get_next`.""" self._get_next_call_count += 1 if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD: warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE) @@ -209,7 +209,7 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset): def prefetch_to_devices(devices, buffer_size=None): """A transformation that prefetches dataset values to the given `devices`. - NOTE: Although the transformation creates a @{tf.data.Dataset}, the + NOTE: Although the transformation creates a `tf.data.Dataset`, the transformation must be the final `Dataset` in the input pipeline. Args: @@ -220,7 +220,7 @@ def prefetch_to_devices(devices, buffer_size=None): Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): return _PrefetchToDeviceDataset(dataset, devices, buffer_size) diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py index d1fdb3279cf2a7cba6e2282d58eedccf38bd38a3..5aa19cf6a9f8411120ed929cecaf93dda6c9edf2 100644 --- a/tensorflow/contrib/distribute/python/single_loss_example.py +++ b/tensorflow/contrib/distribute/python/single_loss_example.py @@ -29,7 +29,8 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops -def single_loss_example(optimizer_fn, distribution, use_bias=False): +def single_loss_example(optimizer_fn, distribution, use_bias=False, + iterations_per_step=1): """Build a very simple network to use in tests and examples.""" def dataset_fn(): @@ -38,12 +39,13 @@ def single_loss_example(optimizer_fn, distribution, use_bias=False): optimizer = optimizer_fn() layer = core.Dense(1, use_bias=use_bias) - def loss_fn(x): + def loss_fn(ctx, x): + del ctx y = array_ops.reshape(layer(x), []) - constant_op.constant(1.) return y * y - single_loss_step = step_fn.StandardSingleLossStep(dataset_fn, loss_fn, - optimizer, distribution) + single_loss_step = step_fn.StandardSingleLossStep( + dataset_fn, loss_fn, optimizer, distribution, iterations_per_step) # Layer is returned for inspecting the kernels in tests. return single_loss_step, layer diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py index d1910622b38c748fc5a814f9e83c2294850d5d12..d3611570b472078bb5f154e9bcb8823c31d39c24 100644 --- a/tensorflow/contrib/distribute/python/step_fn.py +++ b/tensorflow/contrib/distribute/python/step_fn.py @@ -34,15 +34,9 @@ class Step(object): def __call__(self): """Perform one step of this training algorithm.""" - return self.step(self.inputs()) - - def inputs(self): - """For the generating the input to be passed to `step()`.""" raise NotImplementedError("must be implemented in descendants") - def step(self, inputs): - """Perform the main computation of this training algorithm.""" - raise NotImplementedError("must be implemented in descendants") + # TODO(priyag): Add an method to access initialization and finalize ops. class StandardInputStep(Step): @@ -54,12 +48,9 @@ class StandardInputStep(Step): """ def __init__(self, dataset_fn, distribution): - Step.__init__(self, distribution) - self._distributed_input = distribution.distribute_dataset( - dataset_fn).make_one_shot_iterator() - - def inputs(self): - return self._distributed_input.get_next() + super(StandardInputStep, self).__init__(distribution) + self._distributed_input = distribution.distribute_dataset(dataset_fn) + self._iterator = self._distributed_input.make_one_shot_iterator() class StandardSingleLossStep(StandardInputStep): @@ -69,8 +60,8 @@ class StandardSingleLossStep(StandardInputStep): ```python ... - step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer) - step.initialize(distribution) + step = step_fn.StandardSingleLossStep( + dataset, loss_fn, optimizer, distribution) # Run a single training step on a given DistributionStrategy: step(distribution) @@ -80,27 +71,43 @@ class StandardSingleLossStep(StandardInputStep): Args: dataset_fn: a function that returns a tf.data Dataset that produces the input for the model. - loss_fn: a function that returns loss. + loss_fn: a function that takes a context and inputs as arguments. It returns + the loss for those inputs. `context` is an instance of + `values.MultiStepContext` that will be passed when `loss_fn` is run. + `context` can be used to specify the outputs to be returned from + `loss_fn`, among other things. optimizer: an optimizer that implements an update rule. distribution: a `DistributionStrategy` object. """ - def __init__(self, dataset_fn, loss_fn, optimizer, distribution): - StandardInputStep.__init__(self, dataset_fn, distribution) + def __init__(self, dataset_fn, loss_fn, optimizer, distribution, + iterations_per_step=1): + super(StandardSingleLossStep, self).__init__(dataset_fn, distribution) self._loss_fn = loss_fn self._optimizer = optimizer self._is_run_concurrently = False + self._iterations_per_step = iterations_per_step - def step(self, inputs): + def __call__(self): with self._distribution.scope(): - gradients_fn = backprop.implicit_grad(self._loss_fn) - gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) - - grads_and_vars = self.distribution.call_for_each_tower( - gradients_fn, inputs, run_concurrently=self._is_run_concurrently) - # If threads use layers, then we need to run the first step sequentially, - # so that layers.build() is not executed in parallel. Otherwise, multiple - # sets of mirrored variables are going to be created. - self._is_run_concurrently = True - return self._optimizer._distributed_apply( # pylint: disable=protected-access - self.distribution, grads_and_vars) + def step_fn(ctx, inputs): + """Function to run one iteration with one input.""" + gradients_fn = backprop.implicit_grad(self._loss_fn) + gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) + + grads_and_vars = self.distribution.call_for_each_tower( + gradients_fn, + ctx, inputs, + run_concurrently=self._is_run_concurrently) + # If threads use layers, then we need to run the first step + # sequentially, so that layers.build() is not executed in parallel. + # Otherwise, multiple sets of mirrored variables are going to be + # created. + self._is_run_concurrently = True + return self._optimizer._distributed_apply( # pylint: disable=protected-access + self.distribution, grads_and_vars) + + # TODO(priyag): Return the outputs, context, etc as well. + ctx = self.distribution.run_steps_on_dataset( + step_fn, self._iterator, self._iterations_per_step) + return ctx.run_op diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py index 2ee94d8f70868c07ca217dd4d433585458efa8d8..8605ab1f7daeb81e778577ad3c4a18b39c57d743 100644 --- a/tensorflow/contrib/distribute/python/step_fn_test.py +++ b/tensorflow/contrib/distribute/python/step_fn_test.py @@ -33,12 +33,19 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase): @combinations.generate( combinations.times( combinations.distributions_and_v1_optimizers(), - combinations.combine(mode=combinations.graph_and_eager_modes))) - def testTrainNetwork(self, distribution, optimizer_fn): + combinations.combine(mode=combinations.graph_and_eager_modes), + combinations.combine(is_tpu=[False])) + + combinations.combine( + distribution=[combinations.tpu_strategy], + optimizer_fn=combinations.optimizers_v1, + mode=["graph"], + is_tpu=[True])) + def testTrainNetwork(self, distribution, optimizer_fn, is_tpu): with distribution.scope(): single_loss_step, layer = single_loss_example( - optimizer_fn, distribution, use_bias=True) + optimizer_fn, distribution, use_bias=True, iterations_per_step=2) + self.evaluate(distribution.initialize()) if context.executing_eagerly(): run_step = single_loss_step else: @@ -47,12 +54,14 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase): self.evaluate(variables.global_variables_initializer()) weights, biases = [], [] - for _ in range(10): + for _ in range(5): run_step() weights.append(self.evaluate(layer.kernel)) biases.append(self.evaluate(layer.bias)) + self.evaluate(distribution.finalize()) + error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1) is_not_increasing = all(y <= x for x, y in zip(error, error[1:])) self.assertTrue(is_not_increasing) diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index bc53898539d76320e331784f9a717be9491365e1..b510fdb888dafe9f18805bc60e9fb670710521ab 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -21,40 +21,80 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib import tpu +from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib from tensorflow.contrib.distribute.python import one_device_strategy from tensorflow.contrib.distribute.python import values from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib +from tensorflow.contrib.tpu.python.tpu import training_loop +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.training import device_util +from tensorflow.python.training import server_lib from tensorflow.python.util import nest +def get_tpu_system_metadata(tpu_cluster_resolver): + """Retrieves TPU system metadata given a TPUClusterResolver.""" + master = tpu_cluster_resolver.master() + + # pylint: disable=protected-access + cluster_def = (tpu_cluster_resolver.cluster_spec() + or server_lib.ClusterSpec({})).as_cluster_def() + tpu_system_metadata = ( + tpu_system_metadata_lib._query_tpu_system_metadata( + master, + cluster_def=cluster_def, + query_topology=True)) + + return tpu_system_metadata + + class TPUStrategy(one_device_strategy.OneDeviceStrategy): """Experimental TPU distribution strategy implementation.""" - def __init__(self, num_cores_per_host=2): + def __init__(self, tpu_cluster_resolver, steps_per_run): + """Initializes the TPUStrategy object. + + Args: + tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver, + which provides information about the TPU cluster. + steps_per_run: Number of steps to run on device before returning to the + host. Note that this can have side-effects on performance, hooks, + metrics, summaries etc. + This parameter is only used when Distribution Strategy is used with + estimator or keras. + """ # TODO(isaprykin): Generalize the defaults. They are currently tailored for # the unit test. - super(TPUStrategy, self).__init__('/cpu:0') - # TODO(isaprykin): Auto-detect number of cores and hosts. - self._num_cores_per_host = num_cores_per_host + super(TPUStrategy, self).__init__('/device:CPU:0') + + self._tpu_cluster_resolver = tpu_cluster_resolver + self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver) + # TODO(priyag): This should not be hardcoded here. - self._host = '/task:0/device:CPU:0' + self._host = '/device:CPU:0' + # TODO(sourabhbajaj): Remove this once performance of running one step + # at a time is comparable to multiple steps. + self.steps_per_run = steps_per_run def distribute_dataset(self, dataset_fn): # TODO(priyag): Perhaps distribute across cores here. return self._call_dataset_fn(dataset_fn) - # TODO(priyag): Deal with OutOfRange errors. + # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have # a mechanism to infer the outputs of `fn`. Pending b/110550782. def _run_steps_on_dataset(self, fn, iterator, iterations, initial_loop_values=None): - # Enqueue ops + shapes = nest.flatten(iterator.output_shapes) if any([not s.is_fully_defined() for s in shapes]): raise ValueError( @@ -68,7 +108,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): control_deps = [] sharded_inputs = [] with ops.device(self._host): - for _ in range(self._num_cores_per_host): + for _ in range(self.num_towers): # Use control dependencies to ensure a deterministic ordering. with ops.control_dependencies(control_deps): inputs = nest.flatten(iterator.get_next()) @@ -93,58 +133,117 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy): [constant_op.constant(0)], parallel_iterations=1) - # Dequeue ops def dequeue_fn(): - dequeued = tpu.infeed_dequeue_tuple(dtypes=types, shapes=shapes) + dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes) return nest.pack_sequence_as(iterator.output_shapes, dequeued) # Wrap `fn` for repeat. if initial_loop_values is None: - initial_loop_values = [] - ctx = values.MultiStepContext(initial_loop_values) + initial_loop_values = {} + initial_loop_values = nest.flatten(initial_loop_values) + ctx = values.MultiStepContext() def run_fn(*args, **kwargs): del args, kwargs fn_result = fn(ctx, dequeue_fn()) - if ctx.last_step_outputs is None: - ctx.last_step_outputs = [] - with ops.control_dependencies([fn_result]): - return array_ops.identity(ctx.last_step_outputs) + flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) + if flat_last_step_outputs: + with ops.control_dependencies([fn_result]): + return [array_ops.identity(f) for f in flat_last_step_outputs] + else: + return fn_result - # Repeat # TODO(sourabhbajaj): The input to while loop should be based on the output # type of the step_fn def iterate_on_tpu(): - return tpu.repeat(iterations, run_fn, [initial_loop_values]) - - # Re-write and distribute computation. - # TODO(sourabhbajaj): Convert the output to PerDevice variable and - # implement support for that in reduce. - last_step_tensor_outputs = tpu.batch_parallel( - iterate_on_tpu, [], num_shards=self._num_cores_per_host) - - # Take index [0] of last_step_tensor_outputs as we wrapped - # initial_loop_values in a list in the `repeat` call. - return (control_flow_ops.group(last_step_tensor_outputs, enqueue_ops), - last_step_tensor_outputs[0], ctx) + return training_loop.repeat(iterations, run_fn, initial_loop_values) + + replicate_inputs = [[]] * self.num_towers + replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs) + ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops) + + # Filter out any ops from the outputs, typically this would be the case + # when there were no tensor outputs. + last_step_tensor_outputs = [x for x in replicate_outputs + if not isinstance(x, ops.Operation)] + + # Outputs are currently of the structure (grouped by device) + # [[output0_device0, output1_device0, output2_device0], + # [output0_device1, output1_device1, output2_device1]] + # Convert this to the following structure instead: (grouped by output) + # [[output0_device0, output0_device1], + # [output1_device0, output1_device1], + # [output2_device0, output2_device1]] + last_step_tensor_outputs = [list(x) for x in zip(*last_step_tensor_outputs)] + + # Convert replicate_outputs to the original dict structure of + # last_step_outputs. + last_step_tensor_outputs_dict = nest.pack_sequence_as( + ctx.last_step_outputs, last_step_tensor_outputs) + + for (name, aggregation) in ctx._last_step_outputs_aggregations.items(): # pylint: disable=protected-access + output = last_step_tensor_outputs_dict[name] + # For outputs that have already been aggregated, take the first value + # from the list as each value should be the same. Else return the full + # list of values. + if aggregation is not variables_lib.VariableAggregation.NONE: + # TODO(priyag): Should this return the element or a list with 1 element + last_step_tensor_outputs_dict[name] = output[0] + ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access + + return ctx def _call_for_each_tower(self, fn, *args, **kwargs): kwargs.pop('run_concurrently', None) with one_device_strategy._OneDeviceTowerContext(self): # pylint: disable=protected-access return fn(*args, **kwargs) - def get_initialization_ops(self): - return [tpu.initialize_system()] + def initialize(self): + if context.executing_eagerly(): + # TODO(priyag): Add appopriate call here when eager is supported for TPUs. + raise NotImplementedError('Eager mode not supported in TPUStrategy.') + else: + return [tpu.initialize_system()] - def get_finalize_ops(self): - return [tpu.shutdown_system()] + def finalize(self): + if context.executing_eagerly(): + # TODO(priyag): Add appopriate call here when eager is supported for TPUs. + raise NotImplementedError('Eager mode not supported in TPUStrategy.') + else: + return [tpu.shutdown_system()] def _reduce(self, aggregation, value, destinations): - del destinations # TPU is graph mode only. Rely on implicit Send/Recv. + graph = ops.get_default_graph() + cf_context = graph._get_control_flow_context() # pylint: disable=protected-access + # If we're inside the ReplicateContext, reduction should be done using + # CrossReplicaSum while outside we can directly use an add_n op. + while cf_context: + if isinstance(cf_context, tpu.TPUReplicateContext): + if aggregation == vs.VariableAggregation.MEAN: + # TODO(jhseu): Revisit once we support model-parallelism. + value *= (1. / self.num_towers) + return tpu_ops.cross_replica_sum(value) + cf_context = cf_context.outer_context + + # Validate that the destination is same as the host device + # Note we don't do this when in replicate context as the reduction is + # performed on the TPU device itself. + devices = cross_tower_ops_lib.get_devices_from(destinations) + if len(devices) == 1: + assert device_util.canonicalize(devices[0]) == device_util.canonicalize( + self._host) + else: + raise ValueError('Multiple devices are not supported for TPUStrategy') + + output = math_ops.add_n(value) if aggregation == vs.VariableAggregation.MEAN: - # TODO(jhseu): Revisit once we support model-parallelism. - value *= (1. / self._num_cores_per_host) - return tpu_ops.cross_replica_sum(value) + return output * (1. / len(value)) + return output + + def _unwrap(self, value): + if isinstance(value, list): + return value + return [value] @property def num_towers(self): - return self._num_cores_per_host + return self._tpu_metadata.num_of_cores_per_host diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 4018b1e02339e377acc0594407a4f89791ff57af..5fd4c9de696b715c3fb9b8a6ca64923b413a32e9 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training import device_util from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.training import saver @@ -294,6 +295,9 @@ class DistributedVariable(DistributedDelegate): self._primary_var.op.type) return self.get().op + def read_value(self): + return distribute_lib.get_distribution_strategy().read_var(self) + def _should_act_as_resource_variable(self): """Pass resource_variable_ops.is_resource_variable check.""" pass @@ -931,64 +935,120 @@ class MultiStepContext(object): This context object is useful when running multiple steps at a time using the `run_steps_on_dataset` API. For e.g. it allows the user's step function to - specify which outputs to emit at what frequency. Currently it only supports - capturing output from the last step, but will soon be augmented to support - other use cases such as output each N steps. + specify which outputs to emit at what frequency. Currently it supports + capturing output from the last step, as well as capturing non tensor outputs. + In the future it will be augmented to support other use cases such as output + each N steps. """ - def __init__(self, initial_loop_values=None): + def __init__(self): """Initializes an output context. - Args: - initial_loop_values: Initial values passed to the run steps - while loop. The only purpose is to verify the shapes and types - when the actual output is set. This will be removed once we - automatically infer the output shapes and types (and do not need to - check for user error in specifying them manually). Returns: A context object. """ - self._last_step_outputs = None - self._non_tensor_outputs = None - self._initial_loop_values = initial_loop_values + self._last_step_outputs = {} + self._last_step_outputs_aggregations = {} + self._non_tensor_outputs = {} @property def last_step_outputs(self): - """Return the last step's outputs.""" + """A dictionary consisting of outputs to be captured on last step. + + Keys in the dictionary are names of tensors to be captured, as specified + when `set_last_step_output` is called. + Values in the dictionary are the tensors themselves. If + `set_last_step_output` was called with an `aggregation` for this output, + then the value is the aggregated value. + + Returns: + A dictionary with last step outputs. + """ return self._last_step_outputs - @last_step_outputs.setter - def last_step_outputs(self, outputs): - """Set the last step's outputs.""" - self._verify_structure_shapes_types(outputs, self._initial_loop_values) + def _set_last_step_outputs(self, outputs): + """Replace the entire dictionary of last step outputs.""" + if not isinstance(outputs, dict): + raise ValueError("Need a dictionary to set last_step_outputs.") self._last_step_outputs = outputs + def set_last_step_output(self, name, output, + aggregation=variables_lib.VariableAggregation.NONE): + """Set `output` with `name` to be outputted from the last step. + + Args: + name: String, name to identify the output. Doesn't need to match tensor + name. + output: The tensors that should be outputted with `name`. See below for + actual types supported. + aggregation: Aggregation method to use to aggregate outputs from multiple + towers. Required if `set_last_step_output` is called in a tower context. + Optional in cross_tower_context. + When present, the outputs from all the towers are aggregated using the + current distribution strategy's `reduce` method. Hence, the type of + `output` must be what's supported by the corresponding `reduce` method. + For e.g. if using MirroredStrategy and aggregation is set, output + must be a `PerDevice` value. + The aggregation method is also recorded in a dictionary + `_last_step_outputs_aggregations` for later interpreting of the + outputs as already reduced or not. + + """ + if distribute_lib.get_cross_tower_context(): + self._last_step_outputs_aggregations[name] = aggregation + if aggregation is variables_lib.VariableAggregation.NONE: + self._last_step_outputs[name] = output + else: + distribution = distribute_lib.get_distribution_strategy() + self._last_step_outputs[name] = distribution.reduce( + aggregation, output, destinations="/device:CPU:0") + else: + assert aggregation is not variables_lib.VariableAggregation.NONE + def merge_fn(distribution, value): + self._last_step_outputs[name] = distribution.reduce( + aggregation, value, destinations="/device:CPU:0") + # Setting this inside the `merge_fn` because all towers share the same + # context object, so it's more robust to set it only once (even if all + # the towers are trying to set the same value). + self._last_step_outputs_aggregations[name] = aggregation + distribute_lib.get_tower_context().merge_call(merge_fn, output) + @property def non_tensor_outputs(self): - """Return the non tensor outputs.""" + """A dictionary consisting of any non tensor outputs to be captured.""" return self._non_tensor_outputs - @non_tensor_outputs.setter - def non_tensor_outputs(self, outputs): - """Set any non tensor outputs.""" - self._non_tensor_outputs = outputs - - def _verify_structure_shapes_types(self, left, right): - """Verify that the structure, shapes and types of left are same as right.""" - nest.assert_same_structure(left, right) - flat_left = nest.flatten(left) - flat_right = nest.flatten(right) - assert len(flat_left) == len(flat_right), ( - "Length of left {} and right {} should be same.". - format(len(flat_left), len(flat_right))) - - for o, i in zip(flat_left, flat_right): - # TODO(priyag): Add checks for other types like IndexedSlices. - if isinstance(o, ops.Tensor): - assert isinstance(i, ops.Tensor) - assert o.shape == i.shape, ( - "Shape {} of left {} doesn't match shape {} of right {}.". - format(o.shape, o, i.shape, i)) - assert o.dtype == i.dtype, ( - "Dtype {} of left {} doesn't match dtype {} of right {}.". - format(o.dtype, o, i.dtype, i)) + def set_non_tensor_output(self, name, output): + """Set `output` with `name` to be captured as a non tensor output.""" + if distribute_lib.get_cross_tower_context(): + self._non_tensor_outputs[name] = output + else: + def merge_fn(distribution, value): + # NOTE(priyag): For non tensor outputs, we simply return all the values + # in a list as aggregation doesn't make sense on non tensors. + self._non_tensor_outputs[name] = distribution.unwrap(value) + distribute_lib.get_tower_context().merge_call(merge_fn, output) + + +def value_container(val): + """Returns the container that this per-device `value` belongs to. + + Args: + val: A value returned by `call_for_each_tower()` or a variable + created in `scope()`. + + Returns: + A container that `value` belongs to. + If value does not belong to any container (including the case of + container having been destroyed), returns the value itself. + """ + # pylint: disable=protected-access + if (hasattr(val, "_distributed_container") and + # DistributedVariable has _distributed_container defined + # but we don't want to return it. + not isinstance(val, DistributedVariable)): + container = val._distributed_container() + # pylint: disable=protected-access + if container is not None: + return container + return val diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index ad00d1734dd14ed846522a33d888a5387cb25cc6..a8d0d493abcd7de540799f6b94c3cdb9ce9dafae 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -124,7 +124,7 @@ cuda_py_test( cuda_py_test( name = "conditional_distribution_test", - size = "small", + size = "medium", srcs = [ "python/kernel_tests/conditional_distribution_test.py", "python/kernel_tests/distribution_test.py", diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py index 85d604e34ac25cf94b601470b7f166d9d414a8e3..49a9afe3f6debe048369c52328fb5534946ab9e5 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py @@ -29,6 +29,17 @@ from tensorflow.python.platform import test class MatrixInverseTriLBijectorTest(test.TestCase): """Tests the correctness of the Y = inv(tril) transformation.""" + #The inverse of 0 is undefined, as the numbers above the main + #diagonal must be zero, we zero out these numbers after running inverse. + #See: https://github.com/numpy/numpy/issues/11445 + def _inv(self, x): + y = np.linalg.inv(x) + #triu_indices only works on 2d arrays + #need to iterate over all the 2d arrays in a x-dimensional array. + for idx in np.ndindex(y.shape[0:-2]): + y[idx][np.triu_indices(y[idx].shape[-1], 1)] = 0 + return y + @test_util.run_in_graph_and_eager_modes def testComputesCorrectValues(self): inv = bijectors.MatrixInverseTriL(validate_args=True) @@ -98,7 +109,7 @@ class MatrixInverseTriLBijectorTest(test.TestCase): [2., 3.]]], [[[4., 0.], [5., -6.]]]], dtype=np.float32) - x_inv_ = np.linalg.inv(x_) + x_inv_ = self._inv(x_) expected_fldj_ = -4. * np.sum( np.log(np.abs(np.diagonal(x_, axis1=-2, axis2=-1))), axis=-1) diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py index f5aaa5cf34abde3ea4d25de1ecf3adaef3f2a770..aa680a92be64cf0f099acd335369f2a1610c5953 100644 --- a/tensorflow/contrib/distributions/python/ops/sample_stats.py +++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py @@ -134,7 +134,7 @@ def auto_correlation( x_len = util.prefer_static_shape(x_rotated)[-1] # TODO(langmore) Investigate whether this zero padding helps or hurts. At - # the moment is is necessary so that all FFT implementations work. + # the moment is necessary so that all FFT implementations work. # Zero pad to the next power of 2 greater than 2 * x_len, which equals # 2**(ceil(Log_2(2 * x_len))). Note: Log_2(X) = Log_e(X) / Log_e(2). x_len_float64 = math_ops.cast(x_len, np.float64) @@ -198,7 +198,7 @@ def auto_correlation( # Recall R[m] is a sum of N / 2 - m nonzero terms x[n] Conj(x[n - m]). The # other terms were zeros arising only due to zero padding. # `denominator = (N / 2 - m)` (defined below) is the proper term to - # divide by by to make this an unbiased estimate of the expectation + # divide by to make this an unbiased estimate of the expectation # E[X[n] Conj(X[n - m])]. x_len = math_ops.cast(x_len, dtype.real_dtype) max_lags = math_ops.cast(max_lags, dtype.real_dtype) diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD index 0cc764d2208c5b061b7b836bdf57a035f52c6fcf..f7933639a086483b8dc044837276ce0e76840319 100644 --- a/tensorflow/contrib/eager/python/BUILD +++ b/tensorflow/contrib/eager/python/BUILD @@ -199,7 +199,7 @@ py_library( "//tensorflow/python:training", "//tensorflow/python:variable_scope", "//tensorflow/python/eager:context", - "//tensorflow/python/estimator:util", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -223,3 +223,17 @@ py_test( "//tensorflow/python/eager:test", ], ) + +py_test( + name = "remote_test", + srcs = ["remote_test.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/contrib/eager/python:tfe", + "//tensorflow/python:array_ops", + "//tensorflow/python:client", + "//tensorflow/python:framework", + "//tensorflow/python:math_ops", + "//tensorflow/python/eager:function", + ], +) diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index e31dbbe80f9634e8e45ec91bf395eab82942c8ce..135095a97980da8988b976948fb18492526e390c 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -22,16 +22,13 @@ from tensorflow.contrib.data.python.ops import prefetching_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.eager import context from tensorflow.python.framework import ops -from tensorflow.python.ops import gen_dataset_ops -from tensorflow.python.training.checkpointable import base as checkpointable -from tensorflow.python.training.saver import BaseSaverBuilder -class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): +class Iterator(iterator_ops.EagerIterator): """An iterator producing tf.Tensor objects from a tf.data.Dataset. NOTE: Unlike the iterator created by the - @{tf.data.Dataset.make_one_shot_iterator} method, this class enables + `tf.data.Dataset.make_one_shot_iterator` method, this class enables additional experimental functionality, such as prefetching to the GPU. """ @@ -82,30 +79,3 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase): # TODO(b/77291417): Fix with context.execution_mode(context.SYNC): return super(Iterator, self)._next_internal() - - # TODO(shivaniagrawal): Expose checkpointable stateful objects from dataset - # attributes(potential). - - class _Saveable(BaseSaverBuilder.SaveableObject): - """SaveableObject for saving/restoring iterator state.""" - - def __init__(self, iterator_resource, name): - serialized_iterator = gen_dataset_ops.serialize_iterator( - iterator_resource) - specs = [ - BaseSaverBuilder.SaveSpec(serialized_iterator, "", name + "_STATE") - ] - # pylint: disable=protected-access - super(Iterator._Saveable, self).__init__(iterator_resource, specs, name) - - def restore(self, restored_tensors, restored_shapes): - with ops.colocate_with(self.op): - return gen_dataset_ops.deserialize_iterator(self.op, - restored_tensors[0]) - - def _gather_saveables_for_checkpoint(self): - - def _saveable_factory(name): - return self._Saveable(self._resource, name) - - return {"ITERATOR": _saveable_factory} diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py index acc605247faffcf7ba83891dacdab13fc8c8574a..a753d77580758af9de8410de4a08f7ea278c4c79 100644 --- a/tensorflow/contrib/eager/python/datasets_test.py +++ b/tensorflow/contrib/eager/python/datasets_test.py @@ -37,6 +37,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops +from tensorflow.python.training import checkpoint_management from tensorflow.python.training.checkpointable import util as checkpointable_utils @@ -306,6 +307,19 @@ class IteratorTest(test.TestCase): checkpoint.restore(save_path) self.assertEqual(2, iterator.get_next().numpy()) + def testRestoreInReconstructedIterator(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') + dataset = Dataset.range(10) + for i in range(5): + iterator = datasets.Iterator(dataset) + checkpoint = checkpointable_utils.Checkpoint(iterator=iterator) + checkpoint.restore(checkpoint_management.latest_checkpoint( + checkpoint_directory)) + for j in range(2): + self.assertEqual(i * 2 + j, iterator.get_next().numpy()) + checkpoint.save(file_prefix=checkpoint_prefix) + class DatasetConstructorBenchmark(test.Benchmark): diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD index 12155a459c29c353c57679c407e7dda25047a35c..6f02c90368d966b8cf8d0dee09f9d2a5013c90c1 100644 --- a/tensorflow/contrib/eager/python/examples/BUILD +++ b/tensorflow/contrib/eager/python/examples/BUILD @@ -15,8 +15,6 @@ py_library( "//tensorflow/contrib/eager/python/examples/revnet:config", "//tensorflow/contrib/eager/python/examples/rnn_colorbot", "//tensorflow/contrib/eager/python/examples/rnn_ptb", - "//tensorflow/contrib/eager/python/examples/sagan", - "//tensorflow/contrib/eager/python/examples/sagan:config", "//tensorflow/contrib/eager/python/examples/spinn:data", ], ) diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py index bd0057fb1a0175a805a0f7a1e4dcaa2bdc3c435a..4b3cb624bc947a1d1956eff6accb6d4da3bf3b87 100644 --- a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py +++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py @@ -128,8 +128,10 @@ class DensenetBenchmark(tf.test.Benchmark): weight_decay=1e-4, dropout_rate=0, pool_initial=True, include_top=True) logits = model(images, training=True) - loss = tf.losses.softmax_cross_entropy( + cross_ent = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) + regularization = tf.add_n(model.losses) + loss = cross_ent + regularization optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0) train_op = optimizer.minimize(loss) diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py index 4f19711fb87d6b5558302fd69104aca7e2cf403e..0736ed02b7437240e5da4dd529ad9ba9a5a15042 100644 --- a/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py +++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py @@ -98,12 +98,52 @@ class DensenetTest(tf.test.TestCase): output_shape = model(rand_input).shape self.assertEqual(output_shape, (batch_size, output_classes)) + def test_regularization(self): + if tf.test.is_gpu_available(): + rand_input = tf.random_uniform((10, 3, 32, 32)) + data_format = 'channels_first' + else: + rand_input = tf.random_uniform((10, 32, 32, 3)) + data_format = 'channels_last' + weight_decay = 1e-4 + + conv = tf.keras.layers.Conv2D( + 3, (3, 3), + padding='same', + use_bias=False, + data_format=data_format, + kernel_regularizer=tf.keras.regularizers.l2(weight_decay)) + optimizer = tf.train.GradientDescentOptimizer(0.1) + conv(rand_input) # Initialize the variables in the layer + + def compute_true_l2(vs, wd): + return tf.reduce_sum(tf.square(vs)) * wd + + true_l2 = compute_true_l2(conv.variables, weight_decay) + keras_l2 = tf.add_n(conv.losses) + self.assertAllClose(true_l2, keras_l2) + + with tf.GradientTape() as tape_true, tf.GradientTape() as tape_keras: + loss = tf.reduce_sum(conv(rand_input)) + loss_with_true_l2 = loss + compute_true_l2(conv.variables, weight_decay) + loss_with_keras_l2 = loss + tf.add_n(conv.losses) + + true_grads = tape_true.gradient(loss_with_true_l2, conv.variables) + keras_grads = tape_keras.gradient(loss_with_keras_l2, conv.variables) + self.assertAllClose(true_grads, keras_grads) + + optimizer.apply_gradients(zip(keras_grads, conv.variables)) + keras_l2_after_update = tf.add_n(conv.losses) + self.assertNotAllClose(keras_l2, keras_l2_after_update) + def compute_gradients(model, images, labels): with tf.GradientTape() as tape: logits = model(images, training=True) - loss = tf.losses.softmax_cross_entropy( + cross_ent = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) + regularization = tf.add_n(model.losses) + loss = cross_ent + regularization tf.contrib.summary.scalar(name='loss', tensor=loss) return tape.gradient(loss, model.variables) diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ca27a85a229d41a85fa26ecdc982da478fe9e202 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb @@ -0,0 +1,649 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0TD5ZrvEMbhZ" + }, + "source": [ + "##### Copyright 2018 The TensorFlow Authors.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\").\n", + "\n", + "# Convolutional VAE: An example with tf.keras and eager\n", + "\n", + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e \n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ITZuApL56Mny" + }, + "source": [ + "![evolution of output during training](https://tensorflow.org/images/autoencoders/cvae.gif)\n", + "\n", + "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) by training a Variational Autoencoder. (VAE, [[1]](https://arxiv.org/abs/1312.6114), [[2]](https://arxiv.org/abs/1401.4082)).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "P-JuIu2N_SQf" + }, + "outputs": [], + "source": [ + "# to generate gifs\n", + "!pip install imageio" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "e1_Y75QXJS6h" + }, + "source": [ + "## Import TensorFlow and enable Eager execution" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "YfIk2es3hJEd" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import, division, print_function\n", + "\n", + "# Import TensorFlow \u003e= 1.9 and enable eager execution\n", + "import tensorflow as tf\n", + "tfe = tf.contrib.eager\n", + "tf.enable_eager_execution()\n", + "\n", + "import os\n", + "import time\n", + "import numpy as np\n", + "import glob\n", + "import matplotlib.pyplot as plt\n", + "import PIL\n", + "import imageio\n", + "from IPython import display" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iYn4MdZnKCey" + }, + "source": [ + "## Load the MNIST dataset\n", + "Each MNIST image is originally a vector of 784 integers, each of which is between 0-255 and represents the intensity of a pixel. We model each pixel with a Bernoulli distribution in our model, and we statically binarize the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "a4fYMGxGhrna" + }, + "outputs": [], + "source": [ + "(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "NFC2ghIdiZYE" + }, + "outputs": [], + "source": [ + "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n", + "test_images = test_images.reshape(test_images.shape[0], 28, 28, 1).astype('float32')\n", + "\n", + "# Normalizing the images to the range of [0., 1.]\n", + "train_images /= 255.\n", + "test_images /= 255.\n", + "\n", + "# Binarization\n", + "train_images[train_images \u003e= .5] = 1.\n", + "train_images[train_images \u003c .5] = 0.\n", + "test_images[test_images \u003e= .5] = 1.\n", + "test_images[test_images \u003c .5] = 0." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "S4PIDhoDLbsZ" + }, + "outputs": [], + "source": [ + "TRAIN_BUF = 60000\n", + "BATCH_SIZE = 100\n", + "\n", + "TEST_BUF = 10000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PIGN6ouoQxt3" + }, + "source": [ + "## Use *tf.data* to create batches and shuffle the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "-yKCCQOoJ7cn" + }, + "outputs": [], + "source": [ + "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE)\n", + "test_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TEST_BUF).batch(BATCH_SIZE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "THY-sZMiQ4UV" + }, + "source": [ + "## Wire up the generative and inference network with *tf.keras.Sequential*\n", + "\n", + "In our VAE example, we use two small ConvNets for the generative and inference network. Since these neural nets are small, we use `tf.keras.Sequential` to simplify our code. Let $x$ and $z$ denote the observation and latent variable respectively in the following descriptions. \n", + "\n", + "### Generative Network\n", + "This defines the generative model which takes a latent encoding as input, and outputs the parameters for a conditional distribution of the observation, i.e. $p(x|z)$. Additionally, we use a unit Gaussian prior $p(z)$ for the latent variable.\n", + "\n", + "### Inference Network\n", + "This defines an approximate posterior distribution $q(z|x)$, which takes as input an observation and outputs a set of parameters for the conditional distribution of the latent representation. In this example, we simply model this distribution as a diagonal Gaussian. In this case, the inference network outputs the mean and log-variance parameters of a factorized Gaussian (log-variance instead of the variance directly is for numerical stability).\n", + "\n", + "### Reparameterization Trick\n", + "During optimization, we can sample from $q(z|x)$ by first sampling from a unit Gaussian, and then multiplying by the standard deviation and adding the mean. This ensures the gradients could pass through the sample to the inference network parameters.\n", + "\n", + "### Network architecture\n", + "For the inference network, we use two convolutional layers followed by a fully-connected layer. In the generative network, we mirror this architecture by using a fully-connected layer followed by three convolution transpose layers (a.k.a. deconvolutional layers in some contexts). Note, it's common practice to avoid using batch normalization when training VAEs, since the additional stochasticity due to using mini-batches may aggravate instability on top of the stochasticity from sampling." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "VGLbvBEmjK0a" + }, + "outputs": [], + "source": [ + "class CVAE(tf.keras.Model):\n", + " def __init__(self, latent_dim):\n", + " super(CVAE, self).__init__()\n", + " self.latent_dim = latent_dim\n", + " self.inference_net = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.InputLayer(input_shape=(28, 28, 1)),\n", + " tf.keras.layers.Conv2D(\n", + " filters=32, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n", + " tf.keras.layers.Conv2D(\n", + " filters=64, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n", + " tf.keras.layers.Flatten(),\n", + " # No activation\n", + " tf.keras.layers.Dense(latent_dim + latent_dim),\n", + " ]\n", + " )\n", + "\n", + " self.generative_net = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.InputLayer(input_shape=(latent_dim,)),\n", + " tf.keras.layers.Dense(units=7*7*32, activation=tf.nn.relu),\n", + " tf.keras.layers.Reshape(target_shape=(7, 7, 32)),\n", + " tf.keras.layers.Conv2DTranspose(\n", + " filters=64,\n", + " kernel_size=3,\n", + " strides=(2, 2),\n", + " padding=\"SAME\",\n", + " activation=tf.nn.relu),\n", + " tf.keras.layers.Conv2DTranspose(\n", + " filters=32,\n", + " kernel_size=3,\n", + " strides=(2, 2),\n", + " padding=\"SAME\",\n", + " activation=tf.nn.relu),\n", + " # No activation\n", + " tf.keras.layers.Conv2DTranspose(\n", + " filters=1, kernel_size=3, strides=(1, 1), padding=\"SAME\"),\n", + " ]\n", + " )\n", + "\n", + " def sample(self, eps=None):\n", + " if eps is None:\n", + " eps = tf.random_normal(shape=(100, self.latent_dim))\n", + " return self.decode(eps, apply_sigmoid=True)\n", + "\n", + " def encode(self, x):\n", + " mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)\n", + " return mean, logvar\n", + "\n", + " def reparameterize(self, mean, logvar):\n", + " eps = tf.random_normal(shape=mean.shape)\n", + " return eps * tf.exp(logvar * .5) + mean\n", + "\n", + " def decode(self, z, apply_sigmoid=False):\n", + " logits = self.generative_net(z)\n", + " if apply_sigmoid:\n", + " probs = tf.sigmoid(logits)\n", + " return probs\n", + "\n", + " return logits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0FMYgY_mPfTi" + }, + "source": [ + "## Define the loss function and the optimizer\n", + "\n", + "VAEs train by maximizing the evidence lower bound (ELBO) on the marginal log-likelihood:\n", + "\n", + "$$\\log p(x) \\ge \\text{ELBO} = \\mathbb{E}_{q(z|x)}\\left[\\log \\frac{p(x, z)}{q(z|x)}\\right].$$\n", + "\n", + "In practice, we optimize the single sample Monte Carlo estimate of this expectation:\n", + "\n", + "$$\\log p(x| z) + \\log p(z) - \\log q(z|x),$$\n", + "where $z$ is sampled from $q(z|x)$.\n", + "\n", + "**Note**: we could also analytically compute the KL term, but here we incorporate all three terms in the Monte Carlo estimator for simplicity." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "iWCn_PVdEJZ7" + }, + "outputs": [], + "source": [ + "def log_normal_pdf(sample, mean, logvar, raxis=1):\n", + " log2pi = tf.log(2. * np.pi)\n", + " return tf.reduce_sum(\n", + " -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),\n", + " axis=raxis)\n", + "\n", + "def compute_loss(model, x):\n", + " mean, logvar = model.encode(x)\n", + " z = model.reparameterize(mean, logvar)\n", + " x_logit = model.decode(z)\n", + "\n", + " cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)\n", + " logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])\n", + " logpz = log_normal_pdf(z, 0., 0.)\n", + " logqz_x = log_normal_pdf(z, mean, logvar)\n", + " return -tf.reduce_mean(logpx_z + logpz - logqz_x)\n", + "\n", + "def compute_gradients(model, x):\n", + " with tf.GradientTape() as tape:\n", + " loss = compute_loss(model, x)\n", + " return tape.gradient(loss, model.trainable_variables), loss\n", + "\n", + "optimizer = tf.train.AdamOptimizer(1e-4)\n", + "def apply_gradients(optimizer, gradients, variables, global_step=None):\n", + " optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Rw1fkAczTQYh" + }, + "source": [ + "## Training\n", + "\n", + "* We start by iterating over the dataset\n", + "* During each iteration, we pass the image to the encoder to obtain a set of mean and log-variance parameters of the approximate posterior $q(z|x)$\n", + "* We then apply the *reparameterization trick* to sample from $q(z|x)$\n", + "* Finally, we pass the reparameterized samples to the decoder to obtain the logits of the generative distribution $p(x|z)$\n", + "* **Note:** Since we use the dataset loaded by keras with 60k datapoints in the training set and 10k datapoints in the test set, our resulting ELBO on the test set is slightly higher than reported results in the literature which uses dynamic binarization of Larochelle's MNIST.\n", + "\n", + "## Generate Images\n", + "\n", + "* After training, it is time to generate some images\n", + "* We start by sampling a set of latent vectors from the unit Gaussian prior distribution $p(z)$\n", + "* The generator will then convert the latent sample $z$ to logits of the observation, giving a distribution $p(x|z)$\n", + "* Here we plot the probabilities of Bernoulli distributions\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "NS2GWywBbAWo" + }, + "outputs": [], + "source": [ + "epochs = 100\n", + "latent_dim = 50\n", + "num_examples_to_generate = 16\n", + "\n", + "# keeping the random vector constant for generation (prediction) so\n", + "# it will be easier to see the improvement.\n", + "random_vector_for_generation = tf.random_normal(\n", + " shape=[num_examples_to_generate, latent_dim])\n", + "model = CVAE(latent_dim)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "RmdVsmvhPxyy" + }, + "outputs": [], + "source": [ + "def generate_and_save_images(model, epoch, test_input):\n", + " predictions = model.sample(test_input)\n", + " fig = plt.figure(figsize=(4,4))\n", + "\n", + " for i in range(predictions.shape[0]):\n", + " plt.subplot(4, 4, i+1)\n", + " plt.imshow(predictions[i, :, :, 0], cmap='gray')\n", + " plt.axis('off')\n", + "\n", + " # tight_layout minimizes the overlap between 2 sub-plots\n", + " plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "2M7LmLtGEMQJ" + }, + "outputs": [], + "source": [ + "generate_and_save_images(model, 0, random_vector_for_generation)\n", + "\n", + "for epoch in range(1, epochs + 1):\n", + " start_time = time.time()\n", + " for train_x in train_dataset:\n", + " gradients, loss = compute_gradients(model, train_x)\n", + " apply_gradients(optimizer, gradients, model.trainable_variables)\n", + " end_time = time.time()\n", + "\n", + " if epoch % 1 == 0:\n", + " loss = tfe.metrics.Mean()\n", + " for test_x in test_dataset.make_one_shot_iterator():\n", + " loss(compute_loss(model, test_x))\n", + " elbo = -loss.result()\n", + " display.clear_output(wait=False)\n", + " print('Epoch: {}, Test set ELBO: {}, '\n", + " 'time elapse for current epoch {}'.format(epoch,\n", + " elbo,\n", + " end_time - start_time))\n", + " generate_and_save_images(\n", + " model, epoch, random_vector_for_generation)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "P4M_vIbUi7c0" + }, + "source": [ + "### Display an image using the epoch number" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "WfO5wCdclHGL" + }, + "outputs": [], + "source": [ + "def display_image(epoch_no):\n", + " return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "5x3q9_Oe5q0A" + }, + "outputs": [], + "source": [ + "display_image(epochs) # Display images" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NywiH3nL8guF" + }, + "source": [ + "### Generate a GIF of all the saved images." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "IGKQgENQ8lEI" + }, + "outputs": [], + "source": [ + "with imageio.get_writer('cvae.gif', mode='I') as writer:\n", + " filenames = glob.glob('image*.png')\n", + " filenames = sorted(filenames)\n", + " last = -1\n", + " for i,filename in enumerate(filenames):\n", + " frame = 2*(i**0.5)\n", + " if round(frame) \u003e round(last):\n", + " last = frame\n", + " else:\n", + " continue\n", + " image = imageio.imread(filename)\n", + " writer.append_data(image)\n", + " image = imageio.imread(filename)\n", + " writer.append_data(image)\n", + " \n", + "# this is a hack to display the gif inside the notebook\n", + "os.system('cp cvae.gif cvae.gif.png')" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "uV0yiKpzNP1b" + }, + "outputs": [], + "source": [ + "display.Image(filename=\"cvae.gif.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "yQXO_dlXkKsT" + }, + "source": [ + "To downlod the animation from Colab uncomment the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "4fSJS3m5HLFM" + }, + "outputs": [], + "source": [ + "#from google.colab import files\n", + "#files.download('cvae.gif')" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "default_view": {}, + "name": "cvae.ipynb", + "private_outputs": true, + "provenance": [ + { + "file_id": "1eb0NOTQapkYs3X0v-zL1x5_LFKgDISnp", + "timestamp": 1527173385672 + } + ], + "toc_visible": true, + "version": "0.3.2", + "views": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb index 44ff43a1112e771eb6c91c398286a003e17632e0..975105a179f7d777251306474637dc46ddd4f707 100644 --- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb +++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb @@ -40,12 +40,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "u_2z-B3piVsw" }, @@ -69,12 +64,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "YfIk2es3hJEd" }, @@ -82,7 +72,7 @@ "source": [ "from __future__ import absolute_import, division, print_function\n", "\n", - "# Import TensorFlow \u003e= 1.9 and enable eager execution\n", + "# Import TensorFlow \u003e= 1.10 and enable eager execution\n", "import tensorflow as tf\n", "tf.enable_eager_execution()\n", "\n", @@ -112,12 +102,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "a4fYMGxGhrna" }, @@ -130,12 +115,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "NFC2ghIdiZYE" }, @@ -150,12 +130,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "S4PIDhoDLbsZ" }, @@ -179,12 +154,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "-yKCCQOoJ7cn" }, @@ -217,12 +187,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "VGLbvBEmjK0a" }, @@ -265,12 +230,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "bkOfJxk5j5Hi" }, @@ -299,12 +259,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "gDkA05NE6QMs" }, @@ -318,12 +273,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "k1HpMSLImuRi" }, @@ -360,12 +310,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "wkMNfBWlT-PV" }, @@ -388,12 +333,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "90BIcCKcDMxz" }, @@ -407,12 +347,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "iWCn_PVdEJZ7" }, @@ -422,6 +357,34 @@ "generator_optimizer = tf.train.AdamOptimizer(1e-4)" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mWtinsGDPJlV" + }, + "source": [ + "## Checkpoints (Object-based saving)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CA1w-7s2POEy" + }, + "outputs": [], + "source": [ + "checkpoint_dir = './training_checkpoints'\n", + "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", + "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n", + " discriminator_optimizer=discriminator_optimizer,\n", + " generator=generator,\n", + " discriminator=discriminator)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -449,12 +412,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "NS2GWywBbAWo" }, @@ -462,7 +420,7 @@ "source": [ "EPOCHS = 150\n", "noise_dim = 100\n", - "num_examples_to_generate = 100\n", + "num_examples_to_generate = 16\n", "\n", "# keeping the random vector constant for generation (prediction) so\n", "# it will be easier to see the improvement of the gan.\n", @@ -474,12 +432,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "RmdVsmvhPxyy" }, @@ -490,15 +443,13 @@ " # don't want to train the batchnorm layer when doing inference.\n", " predictions = model(test_input, training=False)\n", "\n", - " fig = plt.figure(figsize=(10,10))\n", + " fig = plt.figure(figsize=(4,4))\n", " \n", " for i in range(predictions.shape[0]):\n", - " plt.subplot(10, 10, i+1)\n", + " plt.subplot(4, 4, i+1)\n", " plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n", " plt.axis('off')\n", " \n", - " # tight_layout minimizes the overlap between 2 sub-plots\n", - " plt.tight_layout()\n", " plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n", " plt.show()" ] @@ -507,12 +458,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "2M7LmLtGEMQJ" }, @@ -542,15 +488,20 @@ " discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))\n", "\n", " \n", - " if epoch % 10 == 0:\n", + " if epoch % 1 == 0:\n", " display.clear_output(wait=True)\n", " generate_and_save_images(generator,\n", " epoch + 1,\n", " random_vector_for_generation)\n", - "\n", + " \n", + " # saving (checkpoint) the model every 15 epochs\n", + " if epoch % 15 == 0:\n", + " checkpoint.save(file_prefix = checkpoint_prefix)\n", + " \n", " print ('Time taken for epoch {} is {} sec'.format(epoch + 1,\n", " time.time()-start))\n", " # generating after the final epoch\n", + " display.clear_output(wait=True)\n", " generate_and_save_images(generator,\n", " epochs,\n", " random_vector_for_generation)" @@ -560,12 +511,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "Ly3UN0SLLY2l" }, @@ -574,6 +520,30 @@ "train(train_dataset, EPOCHS, noise_dim)" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "rfM4YcPVPkNO" + }, + "source": [ + "## Restore the latest checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "XhXsd0srPo8c" + }, + "outputs": [], + "source": [ + "# restoring the latest checkpoint in checkpoint_dir\n", + "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))" + ] + }, { "cell_type": "markdown", "metadata": { @@ -581,40 +551,28 @@ "id": "P4M_vIbUi7c0" }, "source": [ - "# Display an image using the epoch number" + "## Display an image using the epoch number" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "WfO5wCdclHGL" }, "outputs": [], "source": [ "def display_image(epoch_no):\n", - " plt.figure(figsize=(15,15))\n", - " plt.imshow(np.array(PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))))\n", - " plt.axis('off')" + " return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "5x3q9_Oe5q0A" }, @@ -647,12 +605,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "IGKQgENQ8lEI" }, @@ -661,23 +614,27 @@ "with imageio.get_writer('dcgan.gif', mode='I') as writer:\n", " filenames = glob.glob('image*.png')\n", " filenames = sorted(filenames)\n", - " for filename in filenames:\n", + " last = -1\n", + " for i,filename in enumerate(filenames):\n", + " frame = 2*(i**0.5)\n", + " if round(frame) \u003e round(last):\n", + " last = frame\n", + " else:\n", + " continue\n", " image = imageio.imread(filename)\n", " writer.append_data(image)\n", - " # this is a hack to display the gif inside the notebook\n", - " os.system('mv dcgan.gif dcgan.gif.png')" + " image = imageio.imread(filename)\n", + " writer.append_data(image)\n", + " \n", + "# this is a hack to display the gif inside the notebook\n", + "os.system('cp dcgan.gif dcgan.gif.png')" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "uV0yiKpzNP1b" }, @@ -686,22 +643,28 @@ "display.Image(filename=\"dcgan.gif.png\")" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6EEG-wePkmJQ" + }, + "source": [ + "To downlod the animation from Colab uncomment the code below:" + ] + }, { "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "4UJjSnIMOzOJ" }, "outputs": [], "source": [ - "" + "#from google.colab import files\n", + "#files.download('dcgan.gif')" ] } ], @@ -709,7 +672,6 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "default_view": {}, "name": "dcgan.ipynb", "private_outputs": true, "provenance": [ @@ -719,8 +681,7 @@ } ], "toc_visible": true, - "version": "0.3.2", - "views": {} + "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb index b173f856c641b4d7dca96adda113f904c97a25a7..78a711548ddfe40a24aaa9e0b0f41a238795a950 100644 --- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb +++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb @@ -96,12 +96,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "wZ6LOM12wKGH" }, @@ -124,18 +119,13 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "yG_n40gFzf9s" }, "outputs": [], "source": [ - "# Import TensorFlow \u003e= 1.9 and enable eager execution\n", + "# Import TensorFlow \u003e= 1.10 and enable eager execution\n", "import tensorflow as tf\n", "\n", "# Note: Once you enable eager execution, it cannot be disabled. \n", @@ -165,12 +155,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "pD_55cOxLkAb" }, @@ -194,12 +179,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "-E5JvY3wzf94" }, @@ -224,12 +204,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "IalZLbvOzf-F" }, @@ -247,12 +222,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "1v_qUYfAzf-I" }, @@ -302,12 +272,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "0UHJDA39zf-O" }, @@ -341,12 +306,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "p2pGotuNzf-S" }, @@ -376,12 +336,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "P3KTiiInzf-a" }, @@ -445,12 +400,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "7t2XrzEOzf-e" }, @@ -463,12 +413,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "dkjWIATszf-h" }, @@ -481,6 +426,32 @@ " return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3K6s6F79P7za" + }, + "source": [ + "## Checkpoints (Object-based saving)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oAGisDdfP9rL" + }, + "outputs": [], + "source": [ + "checkpoint_dir = './training_checkpoints'\n", + "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", + "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n", + " model=model)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -514,12 +485,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "d4tSNwymzf-q" }, @@ -547,17 +513,44 @@ " loss = loss_function(target, predictions)\n", " \n", " grads = tape.gradient(loss, model.variables)\n", - " optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())\n", + " optimizer.apply_gradients(zip(grads, model.variables))\n", "\n", " if batch % 100 == 0:\n", " print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,\n", " batch,\n", " loss))\n", - " \n", + " # saving (checkpoint) the model every 5 epochs\n", + " if epoch % 5 == 0:\n", + " checkpoint.save(file_prefix = checkpoint_prefix)\n", + "\n", " print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n", " print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))" ] }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "01AR9vpNQMFF" + }, + "source": [ + "## Restore the latest checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tyvpYomYQQkF" + }, + "outputs": [], + "source": [ + "# restoring the latest checkpoint in checkpoint_dir\n", + "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))" + ] + }, { "cell_type": "markdown", "metadata": { @@ -584,12 +577,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "WvuwZBX5Ogfd" }, @@ -651,12 +639,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "gtEd86sX5cB2" }, @@ -670,13 +653,11 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "default_view": {}, "name": "text_generation.ipynb", "private_outputs": true, "provenance": [], "toc_visible": true, - "version": "0.3.2", - "views": {} + "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb index 1ab1b71bd0549e06a1d86611c21faef1f182d740..1d07721e3b65ba0f1774d7e7c8d1997608f390c6 100644 --- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -1,39 +1,11 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "nmt_with_attention.ipynb", - "version": "0.3.2", - "views": {}, - "default_view": {}, - "provenance": [ - { - "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U", - "timestamp": 1527858391290 - }, - { - "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv", - "timestamp": 1527776041613 - } - ], - "private_outputs": true, - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, "cells": [ { + "cell_type": "markdown", "metadata": { - "id": "AOpGoE2T-YXS", - "colab_type": "text" + "colab_type": "text", + "id": "AOpGoE2T-YXS" }, - "cell_type": "markdown", "source": [ "##### Copyright 2018 The TensorFlow Authors.\n", "\n", @@ -41,19 +13,19 @@ "\n", "# Neural Machine Translation with Attention\n", "\n", - "
\n", - "\n", - " Run in Google Colab \n", - "\n", - "View source on GitHub
" + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e \n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" ] }, { + "cell_type": "markdown", "metadata": { - "id": "CiwtNgENbx2g", - "colab_type": "text" + "colab_type": "text", + "id": "CiwtNgENbx2g" }, - "cell_type": "markdown", "source": [ "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n", "\n", @@ -61,27 +33,24 @@ "\n", "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence has the model's attention while translating:\n", "\n", - "\"spanish-english\n", + "\u003cimg src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\"\u003e\n", "\n", "Note: This example takes approximately 10 mintues to run on a single P100 GPU." ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "tnxXKDjq3jEL", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "tnxXKDjq3jEL" }, - "cell_type": "code", + "outputs": [], "source": [ "from __future__ import absolute_import, division, print_function\n", "\n", - "# Import TensorFlow >= 1.9 and enable eager execution\n", + "# Import TensorFlow \u003e= 1.10 and enable eager execution\n", "import tensorflow as tf\n", "\n", "tf.enable_eager_execution()\n", @@ -96,16 +65,14 @@ "import time\n", "\n", "print(tf.__version__)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "wfodePkj3jEa", - "colab_type": "text" + "colab_type": "text", + "id": "wfodePkj3jEa" }, - "cell_type": "markdown", "source": [ "## Download and prepare the dataset\n", "\n", @@ -124,17 +91,14 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "kRVATYOgJs1b", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "kRVATYOgJs1b" }, - "cell_type": "code", + "outputs": [], "source": [ "# Download the file\n", "path_to_zip = tf.keras.utils.get_file(\n", @@ -142,22 +106,17 @@ " extract=True)\n", "\n", "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\"" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "rd0jw-eC3jEh", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "rd0jw-eC3jEh" }, - "cell_type": "code", + "outputs": [], "source": [ "# Converts the unicode file to ascii\n", "def unicode_to_ascii(s):\n", @@ -169,7 +128,7 @@ " w = unicode_to_ascii(w.lower().strip())\n", " \n", " # creating a space between a word and the punctuation following it\n", - " # eg: \"he is a boy.\" => \"he is a boy .\" \n", + " # eg: \"he is a boy.\" =\u003e \"he is a boy .\" \n", " # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n", " w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n", " w = re.sub(r'[\" \"]+', \" \", w)\n", @@ -181,24 +140,19 @@ " \n", " # adding a start and an end token to the sentence\n", " # so that the model know when to start and stop predicting.\n", - " w = ' ' + w + ' '\n", + " w = '\u003cstart\u003e ' + w + ' \u003cend\u003e'\n", " return w" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "OHn4Dct23jEm", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "OHn4Dct23jEm" }, - "cell_type": "code", + "outputs": [], "source": [ "# 1. Remove the accents\n", "# 2. Clean the sentences\n", @@ -209,25 +163,20 @@ " word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')] for l in lines[:num_examples]]\n", " \n", " return word_pairs" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "9xbqO7Iie9bb", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "9xbqO7Iie9bb" }, - "cell_type": "code", + "outputs": [], "source": [ - "# This class creates a word -> index mapping (e.g,. \"dad\" -> 5) and vice-versa \n", - "# (e.g., 5 -> \"dad\") for each language,\n", + "# This class creates a word -\u003e index mapping (e.g,. \"dad\" -\u003e 5) and vice-versa \n", + "# (e.g., 5 -\u003e \"dad\") for each language,\n", "class LanguageIndex():\n", " def __init__(self, lang):\n", " self.lang = lang\n", @@ -243,28 +192,23 @@ " \n", " self.vocab = sorted(self.vocab)\n", " \n", - " self.word2idx[''] = 0\n", + " self.word2idx['\u003cpad\u003e'] = 0\n", " for index, word in enumerate(self.vocab):\n", " self.word2idx[word] = index + 1\n", " \n", " for word, index in self.word2idx.items():\n", " self.idx2word[index] = word" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "eAY9k49G3jE_", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "eAY9k49G3jE_" }, - "cell_type": "code", + "outputs": [], "source": [ "def max_length(tensor):\n", " return max(len(t) for t in tensor)\n", @@ -300,86 +244,71 @@ " padding='post')\n", " \n", " return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "GOi42V79Ydlr", - "colab_type": "text" + "colab_type": "text", + "id": "GOi42V79Ydlr" }, - "cell_type": "markdown", "source": [ "### Limit the size of the dataset to experiment faster (optional)\n", "\n", - "Training on the complete dataset of >100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):" + "Training on the complete dataset of \u003e100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):" ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "cnxC7q-j3jFD", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "cnxC7q-j3jFD" }, - "cell_type": "code", + "outputs": [], "source": [ "# Try experimenting with the size of that dataset\n", "num_examples = 30000\n", "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "4QILQkOs3jFG", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "4QILQkOs3jFG" }, - "cell_type": "code", + "outputs": [], "source": [ "# Creating training and validation sets using an 80-20 split\n", "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n", "\n", "# Show length\n", "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "rgCLkfv5uO3d", - "colab_type": "text" + "colab_type": "text", + "id": "rgCLkfv5uO3d" }, - "cell_type": "markdown", "source": [ "### Create a tf.data dataset" ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "TqHsArVZ3jFS", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "TqHsArVZ3jFS" }, - "cell_type": "code", + "outputs": [], "source": [ "BUFFER_SIZE = len(input_tensor_train)\n", "BATCH_SIZE = 64\n", @@ -391,29 +320,27 @@ "\n", "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n", "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "TNfHIF71ulLu", - "colab_type": "text" + "colab_type": "text", + "id": "TNfHIF71ulLu" }, - "cell_type": "markdown", "source": [ "## Write the encoder and decoder model\n", "\n", "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n", "\n", - "\"attention\n", + "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\"\u003e\n", "\n", "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n", "\n", "Here are the equations that are implemented:\n", "\n", - "\"attention\n", - "\"attention\n", + "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\"\u003e\n", + "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\"\u003e\n", "\n", "We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:\n", "\n", @@ -435,17 +362,14 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "avyJ_4VIUoHb", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "avyJ_4VIUoHb" }, - "cell_type": "code", + "outputs": [], "source": [ "def gru(units):\n", " # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n", @@ -461,22 +385,17 @@ " return_state=True, \n", " recurrent_activation='sigmoid', \n", " recurrent_initializer='glorot_uniform')" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "nZ2rI24i3jFg", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "nZ2rI24i3jFg" }, - "cell_type": "code", + "outputs": [], "source": [ "class Encoder(tf.keras.Model):\n", " def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n", @@ -493,22 +412,17 @@ " \n", " def initialize_hidden_state(self):\n", " return tf.zeros((self.batch_sz, self.enc_units))" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "yJ_B3mhW3jFk", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "yJ_B3mhW3jFk" }, - "cell_type": "code", + "outputs": [], "source": [ "class Decoder(tf.keras.Model):\n", " def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n", @@ -562,51 +476,41 @@ " \n", " def initialize_hidden_state(self):\n", " return tf.zeros((self.batch_sz, self.dec_units))" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "P5UY8wko3jFp", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "P5UY8wko3jFp" }, - "cell_type": "code", + "outputs": [], "source": [ "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n", "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "_ch_71VbIRfK", - "colab_type": "text" + "colab_type": "text", + "id": "_ch_71VbIRfK" }, - "cell_type": "markdown", "source": [ "## Define the optimizer and the loss function" ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "WmTHr5iV3jFr", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "WmTHr5iV3jFr" }, - "cell_type": "code", + "outputs": [], "source": [ "optimizer = tf.train.AdamOptimizer()\n", "\n", @@ -615,16 +519,41 @@ " mask = 1 - np.equal(real, 0)\n", " loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n", " return tf.reduce_mean(loss_)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "hpObfY22IddU", - "colab_type": "text" + "colab_type": "text", + "id": "DMVWzzsfNl4e" }, + "source": [ + "## Checkpoints (Object-based saving)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Zj8bXQTgNwrF" + }, + "outputs": [], + "source": [ + "checkpoint_dir = './training_checkpoints'\n", + "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", + "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n", + " encoder=encoder,\n", + " decoder=decoder)" + ] + }, + { "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hpObfY22IddU" + }, "source": [ "## Training\n", "\n", @@ -638,17 +567,14 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "ddefjBMa3jF0", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "ddefjBMa3jF0" }, - "cell_type": "code", + "outputs": [], "source": [ "EPOCHS = 10\n", "\n", @@ -666,7 +592,7 @@ " \n", " dec_hidden = enc_hidden\n", " \n", - " dec_input = tf.expand_dims([targ_lang.word2idx['']] * BATCH_SIZE, 1) \n", + " dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']] * BATCH_SIZE, 1) \n", " \n", " # Teacher forcing - feeding the target as the next input\n", " for t in range(1, targ.shape[1]):\n", @@ -686,26 +612,27 @@ " \n", " gradients = tape.gradient(loss, variables)\n", " \n", - " optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n", + " optimizer.apply_gradients(zip(gradients, variables))\n", " \n", " if batch % 100 == 0:\n", " print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n", " batch,\n", " batch_loss.numpy()))\n", + " # saving (checkpoint) the model every 2 epochs\n", + " if epoch % 2 == 0:\n", + " checkpoint.save(file_prefix = checkpoint_prefix)\n", " \n", " print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n", " total_loss / N_BATCH))\n", " print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "mU3Ce8M6I3rz", - "colab_type": "text" + "colab_type": "text", + "id": "mU3Ce8M6I3rz" }, - "cell_type": "markdown", "source": [ "## Translate\n", "\n", @@ -717,17 +644,14 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "EbQpyYs13jF_", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "EbQpyYs13jF_" }, - "cell_type": "code", + "outputs": [], "source": [ "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n", " attention_plot = np.zeros((max_length_targ, max_length_inp))\n", @@ -744,7 +668,7 @@ " enc_out, enc_hidden = encoder(inputs, hidden)\n", "\n", " dec_hidden = enc_hidden\n", - " dec_input = tf.expand_dims([targ_lang.word2idx['']], 0)\n", + " dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']], 0)\n", "\n", " for t in range(max_length_targ):\n", " predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n", @@ -757,29 +681,24 @@ "\n", " result += targ_lang.idx2word[predicted_id] + ' '\n", "\n", - " if targ_lang.idx2word[predicted_id] == '':\n", + " if targ_lang.idx2word[predicted_id] == '\u003cend\u003e':\n", " return result, sentence, attention_plot\n", " \n", " # the predicted ID is fed back into the model\n", " dec_input = tf.expand_dims([predicted_id], 0)\n", "\n", " return result, sentence, attention_plot" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "s5hQWlbN3jGF", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "s5hQWlbN3jGF" }, - "cell_type": "code", + "outputs": [], "source": [ "# function for plotting the attention weights\n", "def plot_attention(attention, sentence, predicted_sentence):\n", @@ -793,22 +712,17 @@ " ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n", "\n", " plt.show()" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "sl9zUHzg3jGI", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "sl9zUHzg3jGI" }, - "cell_type": "code", + "outputs": [], "source": [ "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n", " result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n", @@ -818,89 +732,91 @@ " \n", " attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n", " plot_attention(attention_plot, sentence.split(' '), result.split(' '))" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "WrAM0FDomq3E", + "colab_type": "text", + "id": "n250XbnjOaqP" + }, + "source": [ + "## Restore the latest checkpoint and test" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "UJpT9D5_OgP6" }, + "outputs": [], + "source": [ + "# restoring the latest checkpoint in checkpoint_dir\n", + "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))" + ] + }, + { "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WrAM0FDomq3E" + }, + "outputs": [], "source": [ "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "zSx2iM36EZQZ", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "zSx2iM36EZQZ" }, - "cell_type": "code", + "outputs": [], "source": [ "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "A3LLCx3ZE0Ls", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "A3LLCx3ZE0Ls" }, - "cell_type": "code", + "outputs": [], "source": [ "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "DUQVLVqUE1YW", + "colab": {}, "colab_type": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - } + "id": "DUQVLVqUE1YW" }, - "cell_type": "code", + "outputs": [], "source": [ "# wrong translation\n", "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "RTe5P5ioMJwN", - "colab_type": "text" + "colab_type": "text", + "id": "RTe5P5ioMJwN" }, - "cell_type": "markdown", "source": [ "## Next steps\n", "\n", @@ -908,5 +824,31 @@ "* Experiment with training on a larger dataset, or using more epochs\n" ] } - ] + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "nmt_with_attention.ipynb", + "private_outputs": true, + "provenance": [ + { + "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U", + "timestamp": 1527858391290 + }, + { + "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv", + "timestamp": 1527776041613 + } + ], + "toc_visible": true, + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb index 7c0f9b5b8161a763c4153ebdeece7e0d1b90b384..51b7ffc4de0cee31f7a907ae7bf90f17056f9bcf 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb @@ -1,46 +1,30 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "automatic_differentiation.ipynb", - "version": "0.3.2", - "views": {}, - "default_view": {}, - "provenance": [], - "private_outputs": true, - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { + "cell_type": "markdown", "metadata": { - "id": "t09eeeR5prIJ", - "colab_type": "text" + "colab_type": "text", + "id": "t09eeeR5prIJ" }, - "cell_type": "markdown", "source": [ "##### Copyright 2018 The TensorFlow Authors." ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "GCCk8_dHpuNf", - "colab_type": "code", + "cellView": "form", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } }, - "cellView": "form" + "colab_type": "code", + "id": "GCCk8_dHpuNf" }, - "cell_type": "code", + "outputs": [], "source": [ "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -53,81 +37,79 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "xh8WkEwWpnm7", - "colab_type": "text" + "colab_type": "text", + "id": "xh8WkEwWpnm7" }, - "cell_type": "markdown", "source": [ "# Automatic differentiation and gradient tape" ] }, { + "cell_type": "markdown", "metadata": { - "id": "idv0bPeCp325", - "colab_type": "text" + "colab_type": "text", + "id": "idv0bPeCp325" }, - "cell_type": "markdown", "source": [ - "
\n", - "\n", - " Run in Google Colab\n", - "\n", - "View source on GitHub
" + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" ] }, { + "cell_type": "markdown", "metadata": { - "id": "vDJ4XzMqodTy", - "colab_type": "text" + "colab_type": "text", + "id": "vDJ4XzMqodTy" }, - "cell_type": "markdown", "source": [ "In the previous tutorial we introduced `Tensor`s and operations on them. In this tutorial we will cover [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation), a key technique for optimizing machine learning models." ] }, { + "cell_type": "markdown", "metadata": { - "id": "GQJysDM__Qb0", - "colab_type": "text" + "colab_type": "text", + "id": "GQJysDM__Qb0" }, - "cell_type": "markdown", "source": [ "## Setup\n" ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "OiMPZStlibBv", - "colab_type": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } - } + }, + "colab_type": "code", + "id": "OiMPZStlibBv" }, - "cell_type": "code", + "outputs": [], "source": [ "import tensorflow as tf\n", "tf.enable_eager_execution()\n", "\n", "tfe = tf.contrib.eager # Shorthand for some symbols" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "1CLWJl0QliB0", - "colab_type": "text" + "colab_type": "text", + "id": "1CLWJl0QliB0" }, - "cell_type": "markdown", "source": [ "## Derivatives of a function\n", "\n", @@ -135,17 +117,19 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "9FViq92UX7P8", - "colab_type": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } - } + }, + "colab_type": "code", + "id": "9FViq92UX7P8" }, - "cell_type": "code", + "outputs": [], "source": [ "from math import pi\n", "\n", @@ -159,17 +143,15 @@ "# with respect to its arguments. Since f() has a single argument,\n", "# grad_f will return a list with a single element.\n", "grad_f = tfe.gradients_function(f)\n", - "assert tf.abs(grad_f(pi/2)[0]).numpy() < 1e-7" - ], - "execution_count": 0, - "outputs": [] + "assert tf.abs(grad_f(pi/2)[0]).numpy() \u003c 1e-7" + ] }, { + "cell_type": "markdown", "metadata": { - "id": "v9fPs8RyopCf", - "colab_type": "text" + "colab_type": "text", + "id": "v9fPs8RyopCf" }, - "cell_type": "markdown", "source": [ "### Higher-order gradients\n", "\n", @@ -177,17 +159,19 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "3D0ZvnGYo0rW", - "colab_type": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } - } + }, + "colab_type": "code", + "id": "3D0ZvnGYo0rW" }, - "cell_type": "code", + "outputs": [], "source": [ "def f(x):\n", " return tf.square(tf.sin(x))\n", @@ -205,16 +189,14 @@ "plt.plot(x, grad(grad(grad(f)))(x), label=\"third derivative\")\n", "plt.legend()\n", "plt.show()" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "-39gouo7mtgu", - "colab_type": "text" + "colab_type": "text", + "id": "-39gouo7mtgu" }, - "cell_type": "markdown", "source": [ "## Gradient tapes\n", "\n", @@ -225,21 +207,25 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "MH0UfjympWf7", - "colab_type": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } - } + }, + "colab_type": "code", + "id": "MH0UfjympWf7" }, - "cell_type": "code", + "outputs": [], "source": [ "def f(x, y):\n", " output = 1\n", - " for i in range(y):\n", + " # Must use range(int(y)) instead of range(y) in Python 3 when\n", + " # using TensorFlow 1.10 and earlier. Can use range(y) in 1.11+\n", + " for i in range(int(y)):\n", " output = tf.multiply(output, x)\n", " return output\n", "\n", @@ -251,16 +237,14 @@ "assert g(3.0, 2).numpy() == 6.0 # And its gradient will be 2 * x\n", "assert f(4.0, 3).numpy() == 64.0 # f(x, 3) is essentially x * x * x\n", "assert g(4.0, 3).numpy() == 48.0 # And its gradient will be 3 * x * x" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "aNmR5-jhpX2t", - "colab_type": "text" + "colab_type": "text", + "id": "aNmR5-jhpX2t" }, - "cell_type": "markdown", "source": [ "At times it may be inconvenient to encapsulate computation of interest into a function. For example, if you want the gradient of the output with respect to intermediate values computed in the function. In such cases, the slightly more verbose but explicit [tf.GradientTape](https://www.tensorflow.org/api_docs/python/tf/GradientTape) context is useful. All computation inside the context of a `tf.GradientTape` is \"recorded\".\n", "\n", @@ -268,17 +252,19 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "bAFeIE8EuVIq", - "colab_type": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } - } + }, + "colab_type": "code", + "id": "bAFeIE8EuVIq" }, - "cell_type": "code", + "outputs": [], "source": [ "x = tf.ones((2, 2))\n", " \n", @@ -300,16 +286,14 @@ "for i in [0, 1]:\n", " for j in [0, 1]:\n", " assert dz_dx[i][j].numpy() == 8.0" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "DK05KXrAAld3", - "colab_type": "text" + "colab_type": "text", + "id": "DK05KXrAAld3" }, - "cell_type": "markdown", "source": [ "### Higher-order gradients\n", "\n", @@ -317,17 +301,19 @@ ] }, { + "cell_type": "code", + "execution_count": 0, "metadata": { - "id": "cPQgthZ7ugRJ", - "colab_type": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 } - } + }, + "colab_type": "code", + "id": "cPQgthZ7ugRJ" }, - "cell_type": "code", + "outputs": [], "source": [ "# TODO(ashankar): Should we use the persistent tape here instead? Follow up on Tom and Alex's discussion\n", "\n", @@ -344,21 +330,37 @@ "\n", "assert dy_dx.numpy() == 3.0\n", "assert d2y_dx2.numpy() == 6.0" - ], - "execution_count": 0, - "outputs": [] + ] }, { + "cell_type": "markdown", "metadata": { - "id": "4U1KKzUpNl58", - "colab_type": "text" + "colab_type": "text", + "id": "4U1KKzUpNl58" }, - "cell_type": "markdown", "source": [ "## Next Steps\n", "\n", "In this tutorial we covered gradient computation in TensorFlow. With that we have enough of the primitives required to build an train neural networks, which we will cover in the [next tutorial](https://github.com/tensorflow/models/tree/master/official/contrib/eager/python/examples/notebooks/3_neural_networks.ipynb)." ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "default_view": {}, + "name": "automatic_differentiation.ipynb", + "private_outputs": true, + "provenance": [], + "toc_visible": true, + "version": "0.3.2", + "views": {} + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..acc0f5b6531c39861cd38b4c0cc7683a4c26c6ff --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb @@ -0,0 +1,810 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0TD5ZrvEMbhZ" + }, + "source": [ + "##### Copyright 2018 The TensorFlow Authors.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\").\n", + "\n", + "# Pix2Pix: An example with tf.keras and eager\n", + "\n", + "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb\"\u003e\n", + " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e \n", + "\u003c/td\u003e\u003ctd\u003e\n", + "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/pix2pix/pix2pix_eager.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ITZuApL56Mny" + }, + "source": [ + "This notebook demonstrates image to image translation using conditional GAN's, as described in [Image-to-Image Translation with Conditional Adversarial Networks](https://arxiv.org/abs/1611.07004). Using this technique we can colorize black and white photos, convert google maps to google earth, etc. Here, we convert building facades to real buildings. We use [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) to achieve this.\n", + "\n", + "In example, we will use the [CMP Facade Database](http://cmp.felk.cvut.cz/~tylecr1/facade/), helpfully provided by the [Center for Machine Perception](http://cmp.felk.cvut.cz/) at the [Czech Technical University in Prague](https://www.cvut.cz/). To keep our example short, we will use a preprocessed [copy](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/) of this dataset, created by the authors of the [paper](https://arxiv.org/abs/1611.07004) above.\n", + "\n", + "Each epoch takes around 58 seconds on a single P100 GPU.\n", + "\n", + "Below is the output generated after training the model for 200 epochs.\n", + "\n", + "\n", + "![sample output_1](https://www.tensorflow.org/images/gan/pix2pix_1.png)\n", + "![sample output_2](https://www.tensorflow.org/images/gan/pix2pix_2.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "e1_Y75QXJS6h" + }, + "source": [ + "## Import TensorFlow and enable eager execution" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "YfIk2es3hJEd" + }, + "outputs": [], + "source": [ + "# Import TensorFlow \u003e= 1.10 and enable eager execution\n", + "import tensorflow as tf\n", + "tf.enable_eager_execution()\n", + "\n", + "import os\n", + "import time\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import PIL\n", + "from IPython.display import clear_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iYn4MdZnKCey" + }, + "source": [ + "## Load the dataset\n", + "\n", + "You can download this dataset and similar datasets from [here](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets). As mentioned in the [paper](https://arxiv.org/abs/1611.07004) we apply random jittering and mirroring to the training dataset.\n", + "* In random jittering, the image is resized to `286 x 286` and then randomly cropped to `256 x 256`\n", + "* In random mirroring, the image is randomly flipped horizontally i.e left to right." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Kn-k8kTXuAlv" + }, + "outputs": [], + "source": [ + "path_to_zip = tf.keras.utils.get_file('facades.tar.gz',\n", + " cache_subdir=os.path.abspath('.'),\n", + " origin='https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/facades.tar.gz', \n", + " extract=True)\n", + "\n", + "PATH = os.path.join(os.path.dirname(path_to_zip), 'facades/')" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2CbTEt448b4R" + }, + "outputs": [], + "source": [ + "BUFFER_SIZE = 400\n", + "BATCH_SIZE = 1\n", + "IMG_WIDTH = 256\n", + "IMG_HEIGHT = 256" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tyaP4hLJ8b4W" + }, + "outputs": [], + "source": [ + "def load_image(image_file, is_train):\n", + " image = tf.read_file(image_file)\n", + " image = tf.image.decode_jpeg(image)\n", + "\n", + " w = tf.shape(image)[1]\n", + "\n", + " w = w // 2\n", + " real_image = image[:, :w, :]\n", + " input_image = image[:, w:, :]\n", + "\n", + " input_image = tf.cast(input_image, tf.float32)\n", + " real_image = tf.cast(real_image, tf.float32)\n", + "\n", + " if is_train:\n", + " # random jittering\n", + " \n", + " # resizing to 286 x 286 x 3\n", + " # method = 2 indicates using \"ResizeMethod.NEAREST_NEIGHBOR\"\n", + " input_image = tf.image.resize_images(input_image, [286, 286], \n", + " align_corners=True, method=2)\n", + " real_image = tf.image.resize_images(real_image, [286, 286], \n", + " align_corners=True, method=2)\n", + " \n", + " # randomly cropping to 256 x 256 x 3\n", + " stacked_image = tf.stack([input_image, real_image], axis=0)\n", + " cropped_image = tf.random_crop(stacked_image, size=[2, IMG_HEIGHT, IMG_WIDTH, 3])\n", + " input_image, real_image = cropped_image[0], cropped_image[1]\n", + "\n", + " if np.random.random() \u003e 0.5:\n", + " # random mirroring\n", + " input_image = tf.image.flip_left_right(input_image)\n", + " real_image = tf.image.flip_left_right(real_image)\n", + " else:\n", + " input_image = tf.image.resize_images(input_image, size=[IMG_HEIGHT, IMG_WIDTH], \n", + " align_corners=True, method=2)\n", + " real_image = tf.image.resize_images(real_image, size=[IMG_HEIGHT, IMG_WIDTH], \n", + " align_corners=True, method=2)\n", + " \n", + " # normalizing the images to [-1, 1]\n", + " input_image = (input_image / 127.5) - 1\n", + " real_image = (real_image / 127.5) - 1\n", + "\n", + " return input_image, real_image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PIGN6ouoQxt3" + }, + "source": [ + "## Use tf.data to create batches, map(do preprocessing) and shuffle the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "SQHmYSmk8b4b" + }, + "outputs": [], + "source": [ + "train_dataset = tf.data.Dataset.list_files(PATH+'train/*.jpg')\n", + "train_dataset = train_dataset.shuffle(BUFFER_SIZE)\n", + "train_dataset = train_dataset.map(lambda x: load_image(x, True))\n", + "train_dataset = train_dataset.batch(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "MS9J0yA58b4g" + }, + "outputs": [], + "source": [ + "test_dataset = tf.data.Dataset.list_files(PATH+'test/*.jpg')\n", + "test_dataset = test_dataset.map(lambda x: load_image(x, False))\n", + "test_dataset = test_dataset.batch(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "THY-sZMiQ4UV" + }, + "source": [ + "## Write the generator and discriminator models\n", + "\n", + "* **Generator** \n", + " * The architecture of generator is a modified U-Net.\n", + " * Each block in the encoder is (Conv -\u003e Batchnorm -\u003e Leaky ReLU)\n", + " * Each block in the decoder is (Transposed Conv -\u003e Batchnorm -\u003e Dropout(applied to the first 3 blocks) -\u003e ReLU)\n", + " * There are skip connections between the encoder and decoder (as in U-Net).\n", + " \n", + "* **Discriminator**\n", + " * The Discriminator is a PatchGAN.\n", + " * Each block in the discriminator is (Conv -\u003e BatchNorm -\u003e Leaky ReLU)\n", + " * The shape of the output after the last layer is (batch_size, 30, 30, 1)\n", + " * Each 30x30 patch of the output classifies a 70x70 portion of the input image (such an architecture is called a PatchGAN).\n", + " * Discriminator receives 2 inputs.\n", + " * Input image and the target image, which it should classify as real.\n", + " * Input image and the generated image (output of generator), which it should classify as fake. \n", + " * We concatenate these 2 inputs together in the code (`tf.concat([inp, tar], axis=-1)`)\n", + "\n", + "* Shape of the input travelling through the generator and the discriminator is in the comments in the code.\n", + "\n", + "To learn more about the architecture and the hyperparameters you can refer the [paper](https://arxiv.org/abs/1611.07004).\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tqqvWxlw8b4l" + }, + "outputs": [], + "source": [ + "OUTPUT_CHANNELS = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lFPI4Nu-8b4q" + }, + "outputs": [], + "source": [ + "class Downsample(tf.keras.Model):\n", + " \n", + " def __init__(self, filters, size, apply_batchnorm=True):\n", + " super(Downsample, self).__init__()\n", + " self.apply_batchnorm = apply_batchnorm\n", + " initializer = tf.random_normal_initializer(0., 0.02)\n", + "\n", + " self.conv1 = tf.keras.layers.Conv2D(filters, \n", + " (size, size), \n", + " strides=2, \n", + " padding='same',\n", + " kernel_initializer=initializer,\n", + " use_bias=False)\n", + " if self.apply_batchnorm:\n", + " self.batchnorm = tf.keras.layers.BatchNormalization()\n", + " \n", + " def call(self, x, training):\n", + " x = self.conv1(x)\n", + " if self.apply_batchnorm:\n", + " x = self.batchnorm(x, training=training)\n", + " x = tf.nn.leaky_relu(x)\n", + " return x \n", + "\n", + "\n", + "class Upsample(tf.keras.Model):\n", + " \n", + " def __init__(self, filters, size, apply_dropout=False):\n", + " super(Upsample, self).__init__()\n", + " self.apply_dropout = apply_dropout\n", + " initializer = tf.random_normal_initializer(0., 0.02)\n", + "\n", + " self.up_conv = tf.keras.layers.Conv2DTranspose(filters, \n", + " (size, size), \n", + " strides=2, \n", + " padding='same',\n", + " kernel_initializer=initializer,\n", + " use_bias=False)\n", + " self.batchnorm = tf.keras.layers.BatchNormalization()\n", + " if self.apply_dropout:\n", + " self.dropout = tf.keras.layers.Dropout(0.5)\n", + "\n", + " def call(self, x1, x2, training):\n", + " x = self.up_conv(x1)\n", + " x = self.batchnorm(x, training=training)\n", + " if self.apply_dropout:\n", + " x = self.dropout(x, training=training)\n", + " x = tf.nn.relu(x)\n", + " x = tf.concat([x, x2], axis=-1)\n", + " return x\n", + "\n", + "\n", + "class Generator(tf.keras.Model):\n", + " \n", + " def __init__(self):\n", + " super(Generator, self).__init__()\n", + " initializer = tf.random_normal_initializer(0., 0.02)\n", + " \n", + " self.down1 = Downsample(64, 4, apply_batchnorm=False)\n", + " self.down2 = Downsample(128, 4)\n", + " self.down3 = Downsample(256, 4)\n", + " self.down4 = Downsample(512, 4)\n", + " self.down5 = Downsample(512, 4)\n", + " self.down6 = Downsample(512, 4)\n", + " self.down7 = Downsample(512, 4)\n", + " self.down8 = Downsample(512, 4)\n", + "\n", + " self.up1 = Upsample(512, 4, apply_dropout=True)\n", + " self.up2 = Upsample(512, 4, apply_dropout=True)\n", + " self.up3 = Upsample(512, 4, apply_dropout=True)\n", + " self.up4 = Upsample(512, 4)\n", + " self.up5 = Upsample(256, 4)\n", + " self.up6 = Upsample(128, 4)\n", + " self.up7 = Upsample(64, 4)\n", + "\n", + " self.last = tf.keras.layers.Conv2DTranspose(OUTPUT_CHANNELS, \n", + " (4, 4), \n", + " strides=2, \n", + " padding='same',\n", + " kernel_initializer=initializer)\n", + " \n", + " @tf.contrib.eager.defun\n", + " def call(self, x, training):\n", + " # x shape == (bs, 256, 256, 3) \n", + " x1 = self.down1(x, training=training) # (bs, 128, 128, 64)\n", + " x2 = self.down2(x1, training=training) # (bs, 64, 64, 128)\n", + " x3 = self.down3(x2, training=training) # (bs, 32, 32, 256)\n", + " x4 = self.down4(x3, training=training) # (bs, 16, 16, 512)\n", + " x5 = self.down5(x4, training=training) # (bs, 8, 8, 512)\n", + " x6 = self.down6(x5, training=training) # (bs, 4, 4, 512)\n", + " x7 = self.down7(x6, training=training) # (bs, 2, 2, 512)\n", + " x8 = self.down8(x7, training=training) # (bs, 1, 1, 512)\n", + "\n", + " x9 = self.up1(x8, x7, training=training) # (bs, 2, 2, 1024)\n", + " x10 = self.up2(x9, x6, training=training) # (bs, 4, 4, 1024)\n", + " x11 = self.up3(x10, x5, training=training) # (bs, 8, 8, 1024)\n", + " x12 = self.up4(x11, x4, training=training) # (bs, 16, 16, 1024)\n", + " x13 = self.up5(x12, x3, training=training) # (bs, 32, 32, 512)\n", + " x14 = self.up6(x13, x2, training=training) # (bs, 64, 64, 256)\n", + " x15 = self.up7(x14, x1, training=training) # (bs, 128, 128, 128)\n", + "\n", + " x16 = self.last(x15) # (bs, 256, 256, 3)\n", + " x16 = tf.nn.tanh(x16)\n", + "\n", + " return x16" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ll6aNeQx8b4v" + }, + "outputs": [], + "source": [ + "class DiscDownsample(tf.keras.Model):\n", + " \n", + " def __init__(self, filters, size, apply_batchnorm=True):\n", + " super(DiscDownsample, self).__init__()\n", + " self.apply_batchnorm = apply_batchnorm\n", + " initializer = tf.random_normal_initializer(0., 0.02)\n", + "\n", + " self.conv1 = tf.keras.layers.Conv2D(filters, \n", + " (size, size), \n", + " strides=2, \n", + " padding='same',\n", + " kernel_initializer=initializer,\n", + " use_bias=False)\n", + " if self.apply_batchnorm:\n", + " self.batchnorm = tf.keras.layers.BatchNormalization()\n", + " \n", + " def call(self, x, training):\n", + " x = self.conv1(x)\n", + " if self.apply_batchnorm:\n", + " x = self.batchnorm(x, training=training)\n", + " x = tf.nn.leaky_relu(x)\n", + " return x \n", + "\n", + "class Discriminator(tf.keras.Model):\n", + " \n", + " def __init__(self):\n", + " super(Discriminator, self).__init__()\n", + " initializer = tf.random_normal_initializer(0., 0.02)\n", + " \n", + " self.down1 = DiscDownsample(64, 4, False)\n", + " self.down2 = DiscDownsample(128, 4)\n", + " self.down3 = DiscDownsample(256, 4)\n", + " \n", + " # we are zero padding here with 1 because we need our shape to \n", + " # go from (batch_size, 32, 32, 256) to (batch_size, 31, 31, 512)\n", + " self.zero_pad1 = tf.keras.layers.ZeroPadding2D()\n", + " self.conv = tf.keras.layers.Conv2D(512, \n", + " (4, 4), \n", + " strides=1, \n", + " kernel_initializer=initializer, \n", + " use_bias=False)\n", + " self.batchnorm1 = tf.keras.layers.BatchNormalization()\n", + " \n", + " # shape change from (batch_size, 31, 31, 512) to (batch_size, 30, 30, 1)\n", + " self.zero_pad2 = tf.keras.layers.ZeroPadding2D()\n", + " self.last = tf.keras.layers.Conv2D(1, \n", + " (4, 4), \n", + " strides=1,\n", + " kernel_initializer=initializer)\n", + " \n", + " @tf.contrib.eager.defun\n", + " def call(self, inp, tar, training):\n", + " # concatenating the input and the target\n", + " x = tf.concat([inp, tar], axis=-1) # (bs, 256, 256, channels*2)\n", + " x = self.down1(x, training=training) # (bs, 128, 128, 64)\n", + " x = self.down2(x, training=training) # (bs, 64, 64, 128)\n", + " x = self.down3(x, training=training) # (bs, 32, 32, 256)\n", + "\n", + " x = self.zero_pad1(x) # (bs, 34, 34, 256)\n", + " x = self.conv(x) # (bs, 31, 31, 512)\n", + " x = self.batchnorm1(x, training=training)\n", + " x = tf.nn.leaky_relu(x)\n", + " \n", + " x = self.zero_pad2(x) # (bs, 33, 33, 512)\n", + " # don't add a sigmoid activation here since\n", + " # the loss function expects raw logits.\n", + " x = self.last(x) # (bs, 30, 30, 1)\n", + "\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gDkA05NE6QMs" + }, + "outputs": [], + "source": [ + "# The call function of Generator and Discriminator have been decorated\n", + "# with tf.contrib.eager.defun()\n", + "# We get a performance speedup if defun is used (~25 seconds per epoch)\n", + "generator = Generator()\n", + "discriminator = Discriminator()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0FMYgY_mPfTi" + }, + "source": [ + "## Define the loss functions and the optimizer\n", + "\n", + "* **Discriminator loss**\n", + " * The discriminator loss function takes 2 inputs; **real images, generated images**\n", + " * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones(since these are the real images)**\n", + " * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros(since these are the fake images)**\n", + " * Then the total_loss is the sum of real_loss and the generated_loss\n", + " \n", + "* **Generator loss**\n", + " * It is a sigmoid cross entropy loss of the generated images and an **array of ones**.\n", + " * The [paper](https://arxiv.org/abs/1611.07004) also includes L1 loss which is MAE (mean absolute error) between the generated image and the target image.\n", + " * This allows the generated image to become structurally similar to the target image.\n", + " * The formula to calculate the total generator loss = gan_loss + LAMBDA * l1_loss, where LAMBDA = 100. This value was decided by the authors of the [paper](https://arxiv.org/abs/1611.07004)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cyhxTuvJyIHV" + }, + "outputs": [], + "source": [ + "LAMBDA = 100" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wkMNfBWlT-PV" + }, + "outputs": [], + "source": [ + "def discriminator_loss(disc_real_output, disc_generated_output):\n", + " real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.ones_like(disc_real_output), \n", + " logits = disc_real_output)\n", + " generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.zeros_like(disc_generated_output), \n", + " logits = disc_generated_output)\n", + "\n", + " total_disc_loss = real_loss + generated_loss\n", + "\n", + " return total_disc_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "90BIcCKcDMxz" + }, + "outputs": [], + "source": [ + "def generator_loss(disc_generated_output, gen_output, target):\n", + " gan_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels = tf.ones_like(disc_generated_output),\n", + " logits = disc_generated_output) \n", + " # mean absolute error\n", + " l1_loss = tf.reduce_mean(tf.abs(target - gen_output))\n", + "\n", + " total_gen_loss = gan_loss + (LAMBDA * l1_loss)\n", + "\n", + " return total_gen_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "iWCn_PVdEJZ7" + }, + "outputs": [], + "source": [ + "generator_optimizer = tf.train.AdamOptimizer(2e-4, beta1=0.5)\n", + "discriminator_optimizer = tf.train.AdamOptimizer(2e-4, beta1=0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "aKUZnDiqQrAh" + }, + "source": [ + "## Checkpoints (Object-based saving)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WJnftd5sQsv6" + }, + "outputs": [], + "source": [ + "checkpoint_dir = './training_checkpoints'\n", + "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n", + "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n", + " discriminator_optimizer=discriminator_optimizer,\n", + " generator=generator,\n", + " discriminator=discriminator)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Rw1fkAczTQYh" + }, + "source": [ + "## Training\n", + "\n", + "* We start by iterating over the dataset\n", + "* The generator gets the input image and we get a generated output.\n", + "* The discriminator receives the input_image and the generated image as the first input. The second input is the input_image and the target_image.\n", + "* Next, we calculate the generator and the discriminator loss.\n", + "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables(inputs) and apply those to the optimizer.\n", + "\n", + "## Generate Images\n", + "\n", + "* After training, its time to generate some images!\n", + "* We pass images from the test dataset to the generator.\n", + "* The generator will then translate the input image into the output we expect.\n", + "* Last step is to plot the predictions and **voila!**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NS2GWywBbAWo" + }, + "outputs": [], + "source": [ + "EPOCHS = 200" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RmdVsmvhPxyy" + }, + "outputs": [], + "source": [ + "def generate_images(model, test_input, tar):\n", + " # the training=True is intentional here since\n", + " # we want the batch statistics while running the model\n", + " # on the test dataset. If we use training=False, we will get \n", + " # the accumulated statistics learned from the training dataset\n", + " # (which we don't want)\n", + " prediction = model(test_input, training=True)\n", + " plt.figure(figsize=(15,15))\n", + "\n", + " display_list = [test_input[0], tar[0], prediction[0]]\n", + " title = ['Input Image', 'Ground Truth', 'Predicted Image']\n", + "\n", + " for i in range(3):\n", + " plt.subplot(1, 3, i+1)\n", + " plt.title(title[i])\n", + " # getting the pixel values between [0, 1] to plot it.\n", + " plt.imshow(display_list[i] * 0.5 + 0.5)\n", + " plt.axis('off')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2M7LmLtGEMQJ" + }, + "outputs": [], + "source": [ + "def train(dataset, epochs): \n", + " for epoch in range(epochs):\n", + " start = time.time()\n", + "\n", + " for input_image, target in dataset:\n", + "\n", + " with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n", + " gen_output = generator(input_image, training=True)\n", + "\n", + " disc_real_output = discriminator(input_image, target, training=True)\n", + " disc_generated_output = discriminator(input_image, gen_output, training=True)\n", + "\n", + " gen_loss = generator_loss(disc_generated_output, gen_output, target)\n", + " disc_loss = discriminator_loss(disc_real_output, disc_generated_output)\n", + "\n", + " generator_gradients = gen_tape.gradient(gen_loss, \n", + " generator.variables)\n", + " discriminator_gradients = disc_tape.gradient(disc_loss, \n", + " discriminator.variables)\n", + "\n", + " generator_optimizer.apply_gradients(zip(generator_gradients, \n", + " generator.variables))\n", + " discriminator_optimizer.apply_gradients(zip(discriminator_gradients, \n", + " discriminator.variables))\n", + "\n", + " if epoch % 1 == 0:\n", + " clear_output(wait=True)\n", + " for inp, tar in test_dataset.take(1):\n", + " generate_images(generator, inp, tar)\n", + " \n", + " # saving (checkpoint) the model every 20 epochs\n", + " if epoch % 20 == 0:\n", + " checkpoint.save(file_prefix = checkpoint_prefix)\n", + "\n", + " print ('Time taken for epoch {} is {} sec\\n'.format(epoch + 1,\n", + " time.time()-start))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "a1zZmKmvOH85" + }, + "outputs": [], + "source": [ + "train(train_dataset, EPOCHS)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kz80bY3aQ1VZ" + }, + "source": [ + "## Restore the latest checkpoint and test" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4t4x69adQ5xb" + }, + "outputs": [], + "source": [ + "# restoring the latest checkpoint in checkpoint_dir\n", + "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1RGysMU_BZhx" + }, + "source": [ + "## Testing on the entire test dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "KUgSnmy2nqSP" + }, + "outputs": [], + "source": [ + "# Run the trained model on the entire test dataset\n", + "for inp, tar in test_dataset:\n", + " generate_images(generator, inp, tar)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3AJXOByaZVOf" + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "pix2pix_eager.ipynb", + "private_outputs": true, + "provenance": [ + { + "file_id": "1eb0NOTQapkYs3X0v-zL1x5_LFKgDISnp", + "timestamp": 1527173385672 + } + ], + "toc_visible": true, + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/revnet/README.md b/tensorflow/contrib/eager/python/examples/revnet/README.md index 2875d0ffb330c2593a7f293f417a5d1ce8322624..822d86e9c7a7e620da3b84ded9af98b1c1d4b701 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/README.md +++ b/tensorflow/contrib/eager/python/examples/revnet/README.md @@ -1,6 +1,6 @@ # RevNet with TensorFlow eager execution -This folder contains a TensorFlow eager implementation of the [Reversible Residual Network](https://arxiv.org/pdf/1707.04585.pdf) adapted from the released implementation by the authors. The presented implementation can be ran both in eager and graph mode. The code is considerably simplified with `tf.GradientTape`. Moreover, we reduce the step of reconstructing the outputs. This saves us from using `tf.stop_gradient` and makes the model run faster. +This folder contains a TensorFlow eager implementation of the [Reversible Residual Network](https://arxiv.org/pdf/1707.04585.pdf) adapted from the released implementation by the authors. The presented implementation can be ran with both eager and graph execution. The code is considerably simplified with `tf.GradientTape`. Moreover, we reduce the a redundant forward pass in the implementation by the authors. This saves us from using `tf.stop_gradient` and makes the model run faster. ## Content @@ -16,7 +16,7 @@ This folder contains a TensorFlow eager implementation of the [Reversible Residu - `resnet_preprocessing.py`, `imagenet_input.py`: Boilerplate to read ImageNet data from TFRecords. ## Train on CIFAR-10/CIFAR-100 -- Make sure you have installed TensorFlow 1.9+ or the latest `tf-nightly` +- Make sure you have installed TensorFlow 1.10+ or the latest `tf-nightly` or `tf-nightly-gpu` pip package in order to access the eager execution feature. - First run @@ -41,11 +41,13 @@ python main.py --data_dir ${PWD}/cifar - `config`: RevNet configuration. - `use_defun`: Use `tfe.defun` to boost performance. -- To train a model with estimators in graph-mode, run +- To train a model with estimators in graph execution, run ```bash python main_estimator.py --data_dir ${PWD}/cifar ``` +To ensure our code works properly when using the Keras model in an estimator, +`tf-nightly` or `tf-nightly-gpu` is highly recommended as of August 2018. - Optional arguments for `main.py` include - `model_dir`: Directory to store eventfiles and checkpoints. @@ -54,13 +56,19 @@ python main_estimator.py --data_dir ${PWD}/cifar - `export`: Export the model for serving if True. ## Speed up with `tfe.defun` -Even though the speed difference between pure eager execution and graph-mode execution is noticeable, -the difference between fully "defunned" model training and graph-mode +To ensure that `tf.contrib.eager.defun` in our code works properly with all +part of the model during training, the latest `tf-nightly` or `tf-nightly-gpu` +is highly recommended as of August 2018. + +Even though the speed difference between pure eager execution and graph execution is noticeable, +the difference between fully "defunned" model training and graph training is negligible. ## Train on ImageNet with Cloud TPUs -The standard way to train models on Cloud TPUs is via TPU estimators and graph-mode +The standard way to train models on Cloud TPUs is via TPU estimators and graph execution. Models built with the `tf.keras` API are fully compatible with TPU estimators. +To ensure our code works properly in this setting, +`tf-nightly` or `tf-nightly-gpu` is highly recommended as of August 2018. ### Setup a Google Cloud project @@ -96,7 +104,8 @@ python main_estimator_tpu.py \ ``` ## Performance -- With the current implementation, RevNet-38 achieves >92% on CIFAR-10 and >71% on CIFAR-100. +- RevNet-38 achieves >92% and >71% accuracy on CIFAR-10 and CIFAR-100 respectively. +- RevNet-56 achieves <26% top-1 error rate on ImageNet. ## Reference The Reversible Residual Network: Backpropagation Without Storing Activations. diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py index fda9020ddf79cd3fd59611d03c1a4202a4901337..9ff6b605b912772a92ab9e07a0ba5b9325030e43 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py +++ b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py @@ -188,6 +188,40 @@ class RevBlockTest(tf.test.TestCase): self._check_grad_angle(dx_true, dx) self._check_grad_angle(dw_true, dw) + def test_backward_grads_with_nativepy(self): + if not tf.test.is_gpu_available(): + self.skipTest("GPU not available") + + input_shape = (128, 8, 8) + data_shape = (16,) + input_shape + x = tf.random_normal(shape=data_shape, dtype=tf.float64) + dy = tf.random_normal(shape=data_shape, dtype=tf.float64) + dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=1) + block = blocks.RevBlock( + n_res=3, + filters=128, + strides=(1, 1), + input_shape=input_shape, + fused=False, + dtype=tf.float64) + with tf.GradientTape() as tape: + tape.watch(x) + x1, x2 = tf.split(x, num_or_size_splits=2, axis=1) + y1, y2 = block((x1, x2), training=True) + y = tf.concat((y1, y2), axis=1) + + # Compute true grads + dx_true = tape.gradient(y, x, output_gradients=dy) + + # Compute grads from reconstruction + (dx1, dx2), _ = block.backward_grads( + x=(x1, x2), y=(y1, y2), dy=(dy1, dy2), training=True) + dx = tf.concat((dx1, dx2), axis=1) + + thres = 1e-5 + diff_abs = tf.reshape(abs(dx - dx_true), [-1]) + assert all(diff_abs < thres) + class _ResidualTest(tf.test.TestCase): diff --git a/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py index e81351b1b14dbf6973e7430c369774339e2dcdd8..34a9984b0ecc527ad1991c28146246b716e96c98 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py +++ b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py @@ -211,8 +211,7 @@ class ImageNetInput(object): dataset = tf.data.Dataset.range(1).repeat().map(self._get_null_input) dataset = dataset.prefetch(batch_size) - dataset = dataset.apply( - tf.contrib.data.batch_and_drop_remainder(batch_size)) + dataset = dataset.batch(batch_size, drop_remainder=True) if self.transpose_input: dataset = dataset.map( lambda images, labels: (tf.transpose(images, [1, 2, 3, 0]), labels), diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py index f0aad9b11088e72e9027e3ba59c1924ace9ee558..8520cf5b71af503be35d5415707a283fb363a476 100644 --- a/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py +++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py @@ -12,22 +12,90 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Cloud TPU Estimator workflow with RevNet train on CIFAR-10.""" +"""Cloud TPU Estimator workflow with RevNet train on ImageNet.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import time from absl import flags import tensorflow as tf -from tensorflow.contrib.eager.python.examples.revnet import cifar_input -from tensorflow.contrib.eager.python.examples.revnet import main as main_ +from tensorflow.contrib import summary +from tensorflow.contrib.eager.python.examples.revnet import config as config_ +from tensorflow.contrib.eager.python.examples.revnet import imagenet_input from tensorflow.contrib.eager.python.examples.revnet import revnet from tensorflow.contrib.training.python.training import evaluation -from tensorflow.python.estimator import estimator as estimator_ +from tensorflow.python.estimator import estimator + +MEAN_RGB = [0.485, 0.456, 0.406] +STDDEV_RGB = [0.229, 0.224, 0.225] + + +def _host_call_fn(gs, loss, lr): + """Training host call. + + Creates scalar summaries for training metrics. + + This function is executed on the CPU and should not directly reference + any Tensors in the rest of the `model_fn`. To pass Tensors from the + model to the `metric_fn`, provide as part of the `host_call`. See + https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec + for more information. + + Arguments should match the list of `Tensor` objects passed as the second + element in the tuple passed to `host_call`. + + Args: + gs: `Tensor with shape `[batch]` for the global_step + loss: `Tensor` with shape `[batch]` for the training loss. + lr: `Tensor` with shape `[batch]` for the learning_rate. + + Returns: + List of summary ops to run on the CPU host. + """ + # Host call fns are executed FLAGS.iterations_per_loop times after one + # TPU loop is finished, setting max_queue value to the same as number of + # iterations will make the summary writer only flush the data to storage + # once per loop. + gs = gs[0] + with summary.create_file_writer( + FLAGS.model_dir, max_queue=FLAGS.iterations_per_loop).as_default(): + with summary.always_record_summaries(): + summary.scalar("loss", loss[0], step=gs) + summary.scalar("learning_rate", lr[0], step=gs) + return summary.all_summary_ops() + + +def _metric_fn(labels, logits): + """Evaluation metric function. Evaluates accuracy. + + This function is executed on the CPU and should not directly reference + any Tensors in the rest of the `model_fn`. To pass Tensors from the model + to the `metric_fn`, provide as part of the `eval_metrics`. See + https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec + for more information. + + Arguments should match the list of `Tensor` objects passed as the second + element in the tuple passed to `eval_metrics`. + + Args: + labels: `Tensor` with shape `[batch]`. + logits: `Tensor` with shape `[batch, num_classes]`. + + Returns: + A dict of the metrics to return from evaluation. + """ + predictions = tf.argmax(logits, axis=1) + top_1_accuracy = tf.metrics.accuracy(labels, predictions) + in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32) + top_5_accuracy = tf.metrics.mean(in_top_5) + + return { + "top_1_accuracy": top_1_accuracy, + "top_5_accuracy": top_5_accuracy, + } def model_fn(features, labels, mode, params): @@ -42,45 +110,58 @@ def model_fn(features, labels, mode, params): Returns: An instance of `tf.contrib.tpu.TPUEstimatorSpec` """ + revnet_config = params["revnet_config"] + model = revnet.RevNet(config=revnet_config) inputs = features if isinstance(inputs, dict): inputs = features["image"] - config = params["config"] - model = revnet.RevNet(config=config) + if revnet_config.data_format == "channels_first": + assert not FLAGS.transpose_input # channels_first only for GPU + inputs = tf.transpose(inputs, [0, 3, 1, 2]) + + if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT: + inputs = tf.transpose(inputs, [3, 0, 1, 2]) # HWCN to NHWC + + # Normalize the image to zero mean and unit variance. + inputs -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=inputs.dtype) + inputs /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=inputs.dtype) if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() learning_rate = tf.train.piecewise_constant( - global_step, config.lr_decay_steps, config.lr_list) - optimizer = tf.train.MomentumOptimizer( - learning_rate, momentum=config.momentum) - + global_step, revnet_config.lr_decay_steps, revnet_config.lr_list) + optimizer = tf.train.MomentumOptimizer(learning_rate, + revnet_config.momentum) if FLAGS.use_tpu: optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) logits, saved_hidden = model(inputs, training=True) grads, loss = model.compute_gradients(saved_hidden, labels, training=True) - train_op = optimizer.apply_gradients( - zip(grads, model.trainable_variables), global_step=global_step) + with tf.control_dependencies(model.get_updates_for(inputs)): + train_op = optimizer.apply_gradients( + zip(grads, model.trainable_variables), global_step=global_step) + if not FLAGS.skip_host_call: + # To log the loss, current learning rate, and epoch for Tensorboard, the + # summary op needs to be run on the host CPU via host_call. host_call + # expects [batch_size, ...] Tensors, thus reshape to introduce a batch + # dimension. These Tensors are implicitly concatenated to + # [params['batch_size']]. + gs_t = tf.reshape(global_step, [1]) + loss_t = tf.reshape(loss, [1]) + lr_t = tf.reshape(learning_rate, [1]) + host_call = (_host_call_fn, [gs_t, loss_t, lr_t]) return tf.contrib.tpu.TPUEstimatorSpec( - mode=tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op) + mode=mode, loss=loss, train_op=train_op, host_call=host_call) elif mode == tf.estimator.ModeKeys.EVAL: logits, _ = model(inputs, training=False) loss = model.compute_loss(labels=labels, logits=logits) - def metric_fn(labels, logits): - predictions = tf.argmax(logits, axis=1) - accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions) - return { - "accuracy": accuracy, - } - return tf.contrib.tpu.TPUEstimatorSpec( - mode=mode, loss=loss, eval_metrics=(metric_fn, [labels, logits])) + mode=mode, loss=loss, eval_metrics=(_metric_fn, [labels, logits])) else: # Predict or export logits, _ = model(inputs, training=False) @@ -97,113 +178,75 @@ def model_fn(features, labels, mode, params): }) -def get_input_fn(config, data_dir, split): - """Get the input function required by the `tf.contrib.tpu.TPUEstimator` API. - - Args: - config: Customized hyperparameters - data_dir: Directory where the data is stored - split: One of `train`, `validation`, `train_all`, and `test` - - Returns: - Input function required by the `tf.contrib.tpu.TPUEstimator` API - """ - - data_dir = os.path.join(data_dir, config.dataset) - # Fix split-dependent hyperparameters - if split == "train_all" or split == "train": - data_aug = True - epochs = config.tpu_epochs - shuffle = True - else: - data_aug = False - epochs = 1 - shuffle = False - - def input_fn(params): - """Input function required by the `tf.contrib.tpu.TPUEstimator` API.""" - batch_size = params["batch_size"] - return cifar_input.get_ds_from_tfrecords( - data_dir=data_dir, - split=split, - data_aug=data_aug, - batch_size=batch_size, # per-shard batch size - epochs=epochs, - shuffle=shuffle, - prefetch=batch_size, # per-shard batch size - data_format=config.data_format) - - return input_fn - - def main(_): tf.logging.set_verbosity(tf.logging.INFO) # RevNet specific configuration - config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset) + revnet_config = { + "revnet-56": config_.get_hparams_imagenet_56(), + "revnet-104": config_.get_hparams_imagenet_104() + }[FLAGS.revnet_config] if FLAGS.use_tpu: - tf.logging.info("Using TPU.") - tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( - FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - else: - tpu_cluster_resolver = None - - # TPU specific configuration - tpu_config = tf.contrib.tpu.TPUConfig( - # Recommended to be set as number of global steps for next checkpoint - iterations_per_loop=FLAGS.iterations_per_loop, - num_shards=FLAGS.num_shards) + revnet_config.data_format = "channels_last" + + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) # Estimator specific configuration - run_config = tf.contrib.tpu.RunConfig( + config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, model_dir=FLAGS.model_dir, session_config=tf.ConfigProto( - allow_soft_placement=True, log_device_placement=False), - tpu_config=tpu_config, + allow_soft_placement=True, log_device_placement=True), + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_shards, + per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig. + PER_HOST_V2), ) - # Construct TPU Estimator - estimator = tf.contrib.tpu.TPUEstimator( + # Input pipelines are slightly different (with regards to shuffling and + # preprocessing) between training and evaluation. + imagenet_train, imagenet_eval = [ + imagenet_input.ImageNetInput( + is_training=is_training, + data_dir=FLAGS.data_dir, + transpose_input=FLAGS.transpose_input, + use_bfloat16=False) for is_training in [True, False] + ] + + revnet_classifier = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, use_tpu=FLAGS.use_tpu, - train_batch_size=config.tpu_batch_size, - eval_batch_size=config.tpu_eval_batch_size, - config=run_config, - params={"config": config}) - - # Construct input functions - train_input_fn = get_input_fn( - config=config, data_dir=FLAGS.data_dir, split="train_all") - eval_input_fn = get_input_fn( - config=config, data_dir=FLAGS.data_dir, split="test") - - # Disabling a range within an else block currently doesn't work - # due to https://github.com/PyCQA/pylint/issues/872 + train_batch_size=revnet_config.tpu_batch_size, + eval_batch_size=revnet_config.tpu_eval_batch_size, + config=config, + export_to_tpu=False, + params={"revnet_config": revnet_config}) + + steps_per_epoch = revnet_config.tpu_iters_per_epoch + eval_steps = revnet_config.tpu_eval_steps + # pylint: disable=protected-access if FLAGS.mode == "eval": - # TPUEstimator.evaluate *requires* a steps argument. - # Note that the number of examples used during evaluation is - # --eval_steps * --batch_size. - # So if you change --batch_size then change --eval_steps too. - eval_steps = 10000 // config.tpu_eval_batch_size - # Run evaluation when there's a new checkpoint for ckpt in evaluation.checkpoints_iterator( FLAGS.model_dir, timeout=FLAGS.eval_timeout): tf.logging.info("Starting to evaluate.") try: start_timestamp = time.time() # This time will include compilation time - eval_results = estimator.evaluate( - input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt) + eval_results = revnet_classifier.evaluate( + input_fn=imagenet_eval.input_fn, + steps=eval_steps, + checkpoint_path=ckpt) elapsed_time = int(time.time() - start_timestamp) tf.logging.info("Eval results: %s. Elapsed seconds: %d" % (eval_results, elapsed_time)) # Terminate eval job when final checkpoint is reached current_step = int(os.path.basename(ckpt).split("-")[1]) - if current_step >= config.max_train_iter: + if current_step >= revnet_config.max_train_iter: tf.logging.info( "Evaluation finished after training step %d" % current_step) break @@ -217,37 +260,56 @@ def main(_): "Checkpoint %s no longer exists, skipping checkpoint" % ckpt) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' - current_step = estimator_._load_global_step_from_checkpoint_dir( + current_step = estimator._load_global_step_from_checkpoint_dir( FLAGS.model_dir) - tf.logging.info("Training for %d steps . Current" - " step %d." % (config.max_train_iter, current_step)) + + tf.logging.info( + "Training for %d steps (%.2f epochs in total). Current" + " step %d." % (revnet_config.max_train_iter, + revnet_config.max_train_iter / steps_per_epoch, + current_step)) start_timestamp = time.time() # This time will include compilation time + if FLAGS.mode == "train": - estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter) + revnet_classifier.train( + input_fn=imagenet_train.input_fn, + max_steps=revnet_config.max_train_iter) + else: - eval_steps = 10000 // config.tpu_eval_batch_size assert FLAGS.mode == "train_and_eval" - while current_step < config.max_train_iter: + while current_step < revnet_config.max_train_iter: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. next_checkpoint = min(current_step + FLAGS.steps_per_eval, - config.max_train_iter) - estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint) + revnet_config.max_train_iter) + revnet_classifier.train( + input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint + tf.logging.info("Finished training up to step %d. Elapsed seconds %d." % + (next_checkpoint, int(time.time() - start_timestamp))) + # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images - # may be consistently excluded modulo the batch size. + # may be excluded modulo the batch size. As long as the batch size is + # consistent, the evaluated images are also consistent. tf.logging.info("Starting to evaluate.") - eval_results = estimator.evaluate( - input_fn=eval_input_fn, steps=eval_steps) + eval_results = revnet_classifier.evaluate( + input_fn=imagenet_eval.input_fn, steps=eval_steps) tf.logging.info("Eval results: %s" % eval_results) - elapsed_time = int(time.time() - start_timestamp) - tf.logging.info("Finished training up to step %d. Elapsed seconds %d." % - (config.max_train_iter, elapsed_time)) - # pylint: enable=protected-access + elapsed_time = int(time.time() - start_timestamp) + tf.logging.info("Finished training up to step %d. Elapsed seconds %d." % + (revnet_config.max_train_iter, elapsed_time)) + + if FLAGS.export_dir is not None: + # The guide to serve an exported TensorFlow model is at: + # https://www.tensorflow.org/serving/serving_basic + tf.logging.info("Starting to export model.") + revnet_classifier.export_savedmodel( + export_dir_base=FLAGS.export_dir, + serving_input_receiver_fn=imagenet_input.image_serving_input_fn) if __name__ == "__main__": @@ -279,14 +341,10 @@ if __name__ == "__main__": default=None, help="[Optional] Directory to store the model information") flags.DEFINE_string( - "dataset", - default="cifar-10", - help="[Optional] The dataset used; either `cifar-10` or `cifar-100`") - flags.DEFINE_string( - "config", - default="revnet-38", + "revnet_config", + default="revnet-56", help="[Optional] Architecture of network. " - "Other options include `revnet-110` and `revnet-164`") + "Other options include `revnet-104`") flags.DEFINE_boolean( "use_tpu", default=True, help="[Optional] Whether to use TPU") flags.DEFINE_integer( @@ -300,20 +358,37 @@ if __name__ == "__main__": " train steps, the loop will exit before reaching" " --iterations_per_loop. The larger this value is, the higher the" " utilization on the TPU.")) - flags.DEFINE_string( - "mode", - default="train_and_eval", - help="[Optional] Mode to run: train, eval, train_and_eval") flags.DEFINE_integer( - "eval_timeout", 60 * 60 * 24, - "Maximum seconds between checkpoints before evaluation terminates.") + "eval_timeout", + default=None, + help="Maximum seconds between checkpoints before evaluation terminates.") flags.DEFINE_integer( "steps_per_eval", - default=1000, + default=5000, help=( "Controls how often evaluation is performed. Since evaluation is" " fairly expensive, it is advised to evaluate as infrequently as" " possible (i.e. up to --train_steps, which evaluates the model only" " after finishing the entire training regime).")) + flags.DEFINE_bool( + "transpose_input", + default=True, + help="Use TPU double transpose optimization") + flags.DEFINE_string( + "export_dir", + default=None, + help=("The directory where the exported SavedModel will be stored.")) + flags.DEFINE_bool( + "skip_host_call", + default=False, + help=("Skip the host_call which is executed every training step. This is" + " generally used for generating training summaries (train loss," + " learning rate, etc...). When --skip_host_call=false, there could" + " be a performance drop if host_call function is slow and cannot" + " keep up with the TPU-side computation.")) + flags.DEFINE_string( + "mode", + default="train_and_eval", + help='One of {"train_and_eval", "train", "eval"}.') FLAGS = flags.FLAGS tf.app.run() diff --git a/tensorflow/contrib/eager/python/examples/sagan/BUILD b/tensorflow/contrib/eager/python/examples/sagan/BUILD deleted file mode 100644 index b470a41d815ce650731680065cc7341f844e3fdc..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/eager/python/examples/sagan/BUILD +++ /dev/null @@ -1,59 +0,0 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//tensorflow:internal"]) - -load("//tensorflow:tensorflow.bzl", "cuda_py_test") - -# Model -py_library( - name = "config", - srcs = ["config.py"], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "ops", - srcs = ["ops.py"], - srcs_version = "PY2AND3", - deps = [ - "//tensorflow:tensorflow_py", - ], -) - -py_library( - name = "sagan", - srcs = ["sagan.py"], - srcs_version = "PY2AND3", - deps = [ - ":ops", - "//tensorflow:tensorflow_py", - ], -) - -# Tests -cuda_py_test( - name = "ops_test", - size = "small", - srcs = ["ops_test.py"], - additional_deps = [ - ":ops", - "//tensorflow:tensorflow_py", - ], -) - -cuda_py_test( - name = "sagan_test", - size = "large", - srcs = ["sagan_test.py"], - additional_deps = [ - ":config", - ":sagan", - "//tensorflow:tensorflow_py", - ], - tags = [ - "optonly", - ], -) diff --git a/tensorflow/contrib/eager/python/examples/sagan/config.py b/tensorflow/contrib/eager/python/examples/sagan/config.py deleted file mode 100644 index 1967bbd867447d9deaf9a7cb3b22a38889276a50..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/eager/python/examples/sagan/config.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Self-attention generative adversarial with eager execution. - -Configuration in format of tf.contrib.training.HParams. -Supports default 128x128 ImageNet. - -Reference [Self-Attention Generative Adversarial -Networks](https://arxiv.org/pdf/1805.08318.pdf) - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -tfe = tf.contrib.eager - - -def get_hparams_imagenet(): - """Configurations to train SAGAN on 128x128 ImageNet dataset.""" - config = tf.contrib.training.HParams() - if tf.test.is_gpu_available(): - config.add_hparam("image_shape", (3, 128, 128)) - config.add_hparam("data_format", "channels_first") - config.add_hparam("g_init_shape", (512, 4, 4)) - else: - config.add_hparam("image_shape", (128, 128, 3)) - config.add_hparam("data_format", "channels_first") - config.add_hparam("g_init_shape", (4, 4, 512)) - - config.add_hparam("latent_dim", 128) - config.add_hparam("update_g_once_every", 1) - config.add_hparam("batch_size", 64) - config.add_hparam("d_init_filters", 32) - config.add_hparam("num_upsamples", 5) - # (512, 4, 4) -> (3, 128, 128) - return config - - -def get_hparams_mock(): - """Configurations of smaller networks for testing.""" - config = tf.contrib.training.HParams() - if tf.test.is_gpu_available(): - config.add_hparam("image_shape", (3, 16, 16)) - config.add_hparam("data_format", "channels_first") - config.add_hparam("g_init_shape", (32, 2, 2)) - else: - config.add_hparam("image_shape", (16, 16, 3)) - config.add_hparam("data_format", "channels_last") - config.add_hparam("g_init_shape", (2, 2, 32)) - - config.add_hparam("latent_dim", 16) - config.add_hparam("update_g_once_every", 1) - config.add_hparam("batch_size", 2) - config.add_hparam("d_init_filters", 4) - config.add_hparam("num_upsamples", 3) - # (32, 2, 2) -> (3, 16, 16) - return config diff --git a/tensorflow/contrib/eager/python/examples/sagan/ops.py b/tensorflow/contrib/eager/python/examples/sagan/ops.py deleted file mode 100644 index 9a03cab1d12fc16baa7343f72ac58ccd39f698bc..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/eager/python/examples/sagan/ops.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Self-attention generative adversarial with eager execution. - -Auxiliary operations. - -Reference [Self-Attention Generative Adversarial -Networks](https://arxiv.org/pdf/1805.08318.pdf) -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf - - -def flatten_hw(x, data_format="channels_first"): - """Flatten the input tensor across height and width dimensions.""" - if data_format == "channels_last": - x = tf.transpose(x, perm=[0, 3, 1, 2]) # Convert to `channels_first` - - old_shape = tf.shape(x) - new_shape = [old_shape[0], old_shape[2] * old_shape[3], old_shape[1]] - - return tf.reshape(x, new_shape) - - -def broaden_hw(x, h, w, c, data_format="channels_first"): - """Broaden dimension so that output has height and width.""" - if data_format == "channels_first": - shape = [-1, c, h, w] - else: - shape = [-1, h, w, c] - - return tf.reshape(x, shape) - - -class BroadenHW(tf.keras.layers.Layer): - """Wrapper class so that `broaden_hw` can be used in `tf.keras.Sequential`.""" - - def __init__(self, h, w, c, data_format="channels_first"): - super(BroadenHW, self).__init__() - self.h = h - self.w = w - self.c = c - self.data_format = data_format - - def call(self, x): - return broaden_hw( - x, h=self.h, w=self.w, c=self.c, data_format=self.data_format) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == "channels_first": - output_shape = (input_shape[0], self.c, self.h, self.w) - else: - output_shape = (input_shape[0], self.h, self.w, self.c) - - return tf.TensorShape(output_shape) diff --git a/tensorflow/contrib/eager/python/examples/sagan/ops_test.py b/tensorflow/contrib/eager/python/examples/sagan/ops_test.py deleted file mode 100644 index 3454985904215b59d27fc4b76ccb4a8c2c2eff00..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/eager/python/examples/sagan/ops_test.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for auxiliary operations.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorflow.contrib.eager.python.examples.sagan import ops - - -class OpsTest(tf.test.TestCase): - - def test_flatten_hw(self): - """Test `flatten_hw` function with mock object.""" - - batch_size = 1 - # Default NCHW format - if tf.test.is_gpu_available(): - x = tf.random_normal(shape=(batch_size, 3, 4, 4)) - y = ops.flatten_hw(x, data_format="channels_first") - self.assertEqual(y.shape, (batch_size, 4 * 4, 3)) - - # NHWC format - x = tf.random_normal(shape=(batch_size, 4, 4, 3)) - y = ops.flatten_hw(x, data_format="channels_last") - self.assertEqual(y.shape, (batch_size, 4 * 4, 3)) - - def test_broaden_hw(self): - """Test `broaden_hw` function with mock object.""" - - batch_size = 1 - # NHWC format - x = tf.random_normal(shape=[batch_size, 4 * 4 * 16]) - y = ops.broaden_hw(x, h=4, w=4, c=16, data_format="channels_last") - self.assertEqual(y.shape, (batch_size, 4, 4, 16)) - - # Default NCHW format - if tf.test.is_gpu_available(): - y = ops.broaden_hw(x, h=4, w=4, c=16, data_format="channels_first") - self.assertEqual(y.shape, (batch_size, 16, 4, 4)) - - -if __name__ == "__main__": - tf.enable_eager_execution() - tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/sagan/sagan.py b/tensorflow/contrib/eager/python/examples/sagan/sagan.py deleted file mode 100644 index 81304149851675e07a3c7f9ad92697da2017022b..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/eager/python/examples/sagan/sagan.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Self-attention generative adversarial with eager execution. - -Code for main model. - -Reference [Self-Attention Generative Adversarial -Networks](https://arxiv.org/pdf/1805.08318.pdf) -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf -from tensorflow.contrib.eager.python.examples.sagan import ops -tfe = tf.contrib.eager - - -class SelfAttentionModule(tf.keras.Model): - """Self-attention module composed of convolutional layers.""" - - def __init__(self, - attention_features, - original_features, - data_format="channels_first"): - """Initialize the module. - - Args: - attention_features: Number of filters for the attention computation. - original_features: Number of filters of the original Tensor. - data_format: Either 'channels_first' or 'channels_last' - """ - super(SelfAttentionModule, self).__init__() - self.data_format = data_format - # Matrix multiplication implemented as 2D Convolution - self.f = tf.keras.layers.Conv2D( - filters=attention_features, - kernel_size=1, - strides=(1, 1), - data_format=data_format) - self.g = tf.keras.layers.Conv2D( - filters=attention_features, - kernel_size=1, - strides=(1, 1), - data_format=data_format) - self.h = tf.keras.layers.Conv2D( - filters=original_features, - kernel_size=1, - strides=(1, 1), - data_format=data_format) - self.scale = tf.Variable(0., trainable=True) - - def call(self, x): - f = self.f(x) - g = self.g(x) - h = self.h(x) - - f_flatten = ops.flatten_hw(f, data_format=self.data_format) - g_flatten = ops.flatten_hw(g, data_format=self.data_format) - h_flatten = ops.flatten_hw(h, data_format=self.data_format) - - s = tf.matmul(g_flatten, f_flatten, transpose_b=True) - b = tf.nn.softmax(s, axis=-1) - o = tf.matmul(b, h_flatten) - y = self.scale * tf.reshape(o, tf.shape(x)) + x - - return y - - def compute_output_shape(self, input_shape): - return input_shape - - -class SAGAN(tf.contrib.checkpoint.Checkpointable): - """Self-attention generative adversarial network.""" - - def __init__(self, config): - """Initialize the model. - - Args: - config: tf.contrib.training.HParams object; specifies hyperparameters - """ - super(SAGAN, self).__init__() - self.config = config - self.generator = self._construct_generator() - self.discriminator = self._construct_discriminator() - - def _construct_generator(self): - """Construct generator.""" - # TODO(lxuechen): Add spectral normalization for WGAN - axis = 1 if self.config.data_format == "channels_first" else 3 - - generator = tf.keras.Sequential() - generator.add( - tf.keras.layers.InputLayer(input_shape=(self.config.latent_dim,))) - generator.add( - tf.keras.layers.Dense( - units=np.prod(self.config.g_init_shape), activation=tf.nn.relu)) - - if self.config.data_format == "channels_first": - c, h, w = self.config.g_init_shape - else: - h, w, c = self.config.g_init_shape - - # Reshape to NHWC/NCHW - generator.add( - ops.BroadenHW(h=h, w=w, c=c, data_format=self.config.data_format)) - - filters_list = [c // 2**p for p in range(1, self.config.num_upsamples + 1)] - filters_list[-1] = 3 # Standard RGB images - - for filters in filters_list[:len(filters_list) // 2]: - generator.add( - tf.keras.layers.Conv2DTranspose( - filters=filters, - kernel_size=4, - strides=(2, 2), - use_bias=False, - padding="SAME", - data_format=self.config.data_format)) - generator.add(tf.keras.layers.BatchNormalization(axis=axis)) - generator.add(tf.keras.layers.Activation("relu")) - - # pylint: disable=undefined-loop-variable - generator.add( - SelfAttentionModule( - original_features=filters, - attention_features=filters // 8, - data_format=self.config.data_format)) - # pylint: enable=undefined-loop-variable - - for filters in filters_list[len(filters_list) // 2:]: - generator.add( - tf.keras.layers.Conv2DTranspose( - filters=filters, - kernel_size=4, - strides=(2, 2), - use_bias=False, - padding="SAME", - data_format=self.config.data_format)) - if filters == 3: - # Assume Image rescaled to [-1, 1] - generator.add(tf.keras.layers.Activation("tanh")) - else: - generator.add(tf.keras.layers.BatchNormalization(axis=axis)) - generator.add(tf.keras.layers.Activation("relu")) - - return generator - - def _construct_discriminator(self): - """Construct discriminator.""" - # TODO(lxuechen): Add spectral normalization for WGAN - discriminator = tf.keras.Sequential() - discriminator.add( - tf.keras.layers.InputLayer(input_shape=self.config.image_shape)) - - filters_list = [ - self.config.d_init_filters * 2**p - for p in range(self.config.num_upsamples) - ] - - for filters in filters_list[:(len(filters_list) + 1) // 2]: - discriminator.add( - tf.keras.layers.Conv2D( - filters=filters, - kernel_size=4, - strides=(2, 2), - padding="SAME", - data_format=self.config.data_format)) - discriminator.add(tf.keras.layers.LeakyReLU(alpha=.1)) - - # pylint: disable=undefined-loop-variable - discriminator.add( - SelfAttentionModule( - original_features=filters, - attention_features=filters // 8, - data_format=self.config.data_format)) - # pylint: enable=undefined-loop-variable - - for filters in filters_list[(len(filters_list) + 1) // 2:]: - discriminator.add( - tf.keras.layers.Conv2D( - filters=filters, - kernel_size=4, - strides=(2, 2), - padding="SAME", - data_format=self.config.data_format)) - discriminator.add(tf.keras.layers.LeakyReLU(alpha=.1)) - - discriminator.add(tf.keras.layers.Flatten()) - discriminator.add(tf.keras.layers.Dense(units=1)) - - return discriminator - - def compute_loss_and_grads(self, real_images, noise, training=True): - """Compute loss and gradients for both generator and discriminator.""" - # TODO(lxuechen): Add gradient penalty for discriminator - with tf.GradientTape() as g_tape, tf.GradientTape() as d_tape: - real_logits = self.discriminator(real_images, training=training) - - fake_images = self.generator.call(noise, training=training) - fake_logits = self.discriminator.call(fake_images) - - g_loss = self.compute_g_loss(fake_logits) - d_loss = self.compute_d_loss(fake_logits, real_logits) - - g_grads = g_tape.gradient(g_loss, self.generator.trainable_variables) - d_grads = d_tape.gradient(d_loss, self.discriminator.trainable_variables) - - return g_loss, d_loss, g_grads, d_grads - - def compute_g_loss(self, fake_logits): - return -tf.reduce_mean(fake_logits) # Hinge loss - - def compute_d_loss(self, fake_logits, real_logits): - # Hinge loss - real_loss = tf.reduce_mean(tf.nn.relu(1. - real_logits)) - fake_loss = tf.reduce_mean(tf.nn.relu(1. + fake_logits)) - return real_loss + fake_loss diff --git a/tensorflow/contrib/eager/python/examples/sagan/sagan_test.py b/tensorflow/contrib/eager/python/examples/sagan/sagan_test.py deleted file mode 100644 index 18345945108111b57c5401c26b7dca0bfc8f8316..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/eager/python/examples/sagan/sagan_test.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for self-attention generative adversarial network.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -from tensorflow.contrib.eager.python.examples.sagan import config as config_ -from tensorflow.contrib.eager.python.examples.sagan import sagan -tfe = tf.contrib.eager - - -class SAGANTest(tf.test.TestCase): - - def setUp(self): - super(SAGANTest, self).setUp() - config = config_.get_hparams_mock() - self.noise_shape = (config.batch_size, config.latent_dim) - self.logits_shape = (config.batch_size, 1) - self.images_shape = (config.batch_size,) + config.image_shape - - self.model = sagan.SAGAN(config=config) - self.noise = tf.random_normal(shape=self.noise_shape) - self.real_images = tf.random_normal(shape=self.images_shape) - self.config = config - - def tearDown(self): - del self.model - del self.noise - del self.real_images - super(SAGANTest, self).tearDown() - - def test_generator_call(self): - """Test `generator.__call__` function.""" - fake_images = self.model.generator(self.noise, training=False) - self.assertEqual(fake_images.shape, self.images_shape) - - def test_generator_call_defun(self): - """Test `generator.__call__` function with defun.""" - call_ = tfe.defun(self.model.generator.__call__) - fake_images = call_(self.noise, training=False) - self.assertEqual(fake_images.shape, self.images_shape) - - def test_discriminator_call(self): - """Test `discriminator.__call__` function.""" - real_logits = self.model.discriminator(self.real_images) - self.assertEqual(real_logits.shape, self.logits_shape) - - def test_discriminator_call_defun(self): - """Test `discriminator.__call__` function with defun.""" - call_ = tfe.defun(self.model.discriminator.__call__) - real_logits = call_(self.real_images) - self.assertEqual(real_logits.shape, self.logits_shape) - - def test_compute_loss_and_grads(self): - """Test `compute_loss_and_grads` function.""" - g_loss, d_loss, g_grads, d_grads = self.model.compute_loss_and_grads( - self.real_images, self.noise, training=False) - self.assertEqual(g_loss.shape, ()) - self.assertEqual(d_loss.shape, ()) - self.assertTrue(isinstance(g_grads, list)) - self.assertTrue(isinstance(d_grads, list)) - g_vars = self.model.generator.trainable_variables - d_vars = self.model.discriminator.trainable_variables - - self.assertEqual(len(g_grads), len(g_vars)) - self.assertEqual(len(d_grads), len(d_vars)) - - def test_compute_loss_and_grads_defun(self): - """Test `compute_loss_and_grads` function with defun.""" - compute_loss_and_grads = tfe.defun(self.model.compute_loss_and_grads) - g_loss, d_loss, g_grads, d_grads = compute_loss_and_grads( - self.real_images, self.noise, training=False) - self.assertEqual(g_loss.shape, ()) - self.assertEqual(d_loss.shape, ()) - self.assertTrue(isinstance(g_grads, list)) - self.assertTrue(isinstance(d_grads, list)) - g_vars = self.model.generator.trainable_variables - d_vars = self.model.discriminator.trainable_variables - - self.assertEqual(len(g_grads), len(g_vars)) - self.assertEqual(len(d_grads), len(d_vars)) - - -if __name__ == "__main__": - tf.enable_eager_execution() - tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py index 8ac553e0ae71382966d03d9ef4429adf5137b369..d18a097063c7d25947af3e2e2959ce574edd553f 100644 --- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py +++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py @@ -36,7 +36,7 @@ from third_party.examples.eager.spinn import spinn from tensorflow.contrib.summary import summary_test_util from tensorflow.python.eager import test from tensorflow.python.framework import test_util -from tensorflow.python.training import saver +from tensorflow.python.training import checkpoint_management from tensorflow.python.training.checkpointable import util as checkpointable_utils # pylint: enable=g-bad-import-order @@ -422,7 +422,7 @@ class SpinnTest(test_util.TensorFlowTestCase): # 5. Verify that checkpoints exist and contains all the expected variables. self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*"))) object_graph = checkpointable_utils.object_metadata( - saver.latest_checkpoint(config.logdir)) + checkpoint_management.latest_checkpoint(config.logdir)) ckpt_variable_names = set() for node in object_graph.nodes: for attribute in node.attributes: diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py new file mode 100644 index 0000000000000000000000000000000000000000..76f48eeb1cab9d1f014adeafe4827cb5d3a8c77d --- /dev/null +++ b/tensorflow/contrib/eager/python/remote_test.py @@ -0,0 +1,178 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for remote eager execution.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +import numpy as np + +from tensorflow.core.protobuf import cluster_pb2 +from tensorflow.core.protobuf import tensorflow_server_pb2 +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.eager import function +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.platform import test +from tensorflow.python.training import server_lib + +JOB_NAME = "remote_device" +ALT_JOB_NAME = "alt_remote_device" + + +def run_sync_and_async(f): + """Execute all test methods in the given class in sync and async modes.""" + + @functools.wraps(f) + def decorator(self, *args, **kwargs): + with context.execution_mode(context.ASYNC): + f(self, *args, **kwargs) + + with context.execution_mode(context.SYNC): + f(self, *args, **kwargs) + + return decorator + + +def get_server_def(job_name, local_server_port, remote_server_addresses, + task_index): + """Returns a server def with a single job + multiple tasks.""" + cluster_def = cluster_pb2.ClusterDef() + job_def = cluster_def.job.add() + job_def.name = job_name + job_def.tasks[0] = "localhost:%d" % local_server_port + + for i, remote_server_address in enumerate(remote_server_addresses, start=1): + job_def.tasks[i] = remote_server_address + + server_def = tensorflow_server_pb2.ServerDef( + cluster=cluster_def, + job_name=job_name, + task_index=task_index, + protocol="grpc") + + return server_def + + +class RemoteExecutionTest(test.TestCase): + + def __init__(self, methodName="runTest"): # pylint: disable=invalid-name + super(RemoteExecutionTest, self).__init__(methodName) + self._cached_server1 = server_lib.Server.create_local_server() + self._cached_server2 = server_lib.Server.create_local_server() + + os.environ["TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC"] = "1" + + self._cached_server1_target = self._cached_server1.target[len("grpc://"):] + self._cached_server2_target = self._cached_server2.target[len("grpc://"):] + + # Start the local server. + context.set_server_def( + server_def=get_server_def( + JOB_NAME, + local_server_port=0, + remote_server_addresses=[ + self._cached_server1_target, self._cached_server2_target + ], + task_index=0)) + + @run_sync_and_async + def testDefunMatmul(self): + """Basic remote eager execution with defun.""" + + mm_defun = function.defun(math_ops.matmul) + with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME): + x1 = array_ops.ones([2, 2]) + with ops.device("job:%s/replica:0/task:2/device:CPU:0" % JOB_NAME): + x2 = array_ops.ones([2, 2]) + y = mm_defun(x1, x2) + np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy()) + + @run_sync_and_async + def testSimpleMatmul(self): + """Basic remote eager execution.""" + + with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME): + x1 = array_ops.ones([2, 2]) + with ops.device("job:%s/replica:0/task:2/device:CPU:0" % JOB_NAME): + x2 = array_ops.ones([2, 2]) + y = math_ops.matmul(x1, x2) + np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy()) + + @run_sync_and_async + def testSimpleWeightRead(self): + """Basic remote eager weight read.""" + + with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME): + w = resource_variable_ops.ResourceVariable([[2.0]]) + loss = w * w + np.testing.assert_array_equal([[4.0]], loss.numpy()) + + @run_sync_and_async + def testTapeWeightRead(self): + """Remote eager weight read in a tape.""" + + with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME): + w = resource_variable_ops.ResourceVariable([[3.0]]) + with backprop.GradientTape() as tape: + loss = w * w + + grad = tape.gradient(loss, w) + np.testing.assert_array_equal([[9.0]], loss.numpy()) + np.testing.assert_array_equal([[6.0]], grad.numpy()) + + @run_sync_and_async + def testServerDefChanged(self): + """Update server def, and run ops on new cluster.""" + context.set_server_def( + server_def=get_server_def( + ALT_JOB_NAME, + local_server_port=0, + remote_server_addresses=[ + self._cached_server1_target, self._cached_server2_target + ], + task_index=0)) + + with ops.device("job:%s/replica:0/task:1/device:CPU:0" % ALT_JOB_NAME): + x1 = array_ops.ones([2, 2]) + y = math_ops.matmul(x1, x1) + np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy()) + + # Set the server def back to JOB_NAME + context.set_server_def( + server_def=get_server_def( + JOB_NAME, + local_server_port=0, + remote_server_addresses=[ + self._cached_server1_target, self._cached_server2_target + ], + task_index=0)) + + with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME): + x1 = array_ops.ones([2, 2]) + y = math_ops.matmul(x1, x1) + np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy()) + + +if __name__ == "__main__": + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py index d70930864784b3e48140da27ca33ff13f593e663..f9c716360c5755ee1902b576545d776725f9966f 100644 --- a/tensorflow/contrib/eager/python/saver.py +++ b/tensorflow/contrib/eager/python/saver.py @@ -161,7 +161,7 @@ class Saver(object): Args: file_prefix: Path prefix where parameters were previously saved. Typically obtained from a previous `save()` call, or from - @{tf.train.latest_checkpoint}. + `tf.train.latest_checkpoint`. """ with ops.device("/device:CPU:0"): self._saver.restore(None, file_prefix) diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index 2f0ab616e40560e21dfe19fffb0010f724e48ecd..de11d00a1a0a34372467eedb02d790c920e7f449 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -71,6 +71,8 @@ To use, at program startup, call `tfe.enable_eager_execution()`. @@run_test_in_graph_and_eager_modes @@run_all_tests_in_graph_and_eager_modes +@@TensorSpec + @@DEVICE_PLACEMENT_EXPLICIT @@DEVICE_PLACEMENT_WARN @@DEVICE_PLACEMENT_SILENT @@ -114,6 +116,7 @@ from tensorflow.python.eager.execution_callbacks import inf_callback from tensorflow.python.eager.execution_callbacks import inf_nan_callback from tensorflow.python.eager.execution_callbacks import nan_callback from tensorflow.python.eager.execution_callbacks import seterr +from tensorflow.python.framework.tensor_spec import TensorSpec from tensorflow.python.framework.ops import enable_eager_execution from tensorflow.python.framework.ops import enable_eager_execution_internal as enable_remote_eager_execution from tensorflow.python.framework.ops import eager_run as run diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index 349f48f7f788b458af2639f7ad4cc4cd904465b4..82272bf1207c9b85243bb1c2d92a2c6704a2761e 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -487,6 +487,9 @@ py_test( size = "medium", srcs = ["python/estimator/saved_model_estimator_test.py"], srcs_version = "PY2AND3", + tags = [ + "notsan", + ], deps = [ ":export", ":saved_model_estimator", diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py index 2eef60c39f54bfb464b7da0eb57a47e9eee9b800..724bc2c82f8289bbaa19a1dbbc1dc81b6e158e02 100644 --- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py +++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py @@ -147,7 +147,7 @@ class DNNLinearCombinedEstimator(estimator.Estimator): if a categorical column is multivalent. One of "mean", "sqrtn", and "sum" -- these are effectively different ways to do example-level normalization, which can be useful for bag-of-words features. For more - details, see @{tf.feature_column.linear_model$linear_model}. + details, see `tf.feature_column.linear_model`. Raises: ValueError: If both linear_feature_columns and dnn_features_columns are diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py index bf08be09e7baf63e507a6a4db6a91e7b6bb20b74..26449b46516fe1d8c93a8e3567f93801c689a65a 100644 --- a/tensorflow/contrib/estimator/python/estimator/extenders.py +++ b/tensorflow/contrib/estimator/python/estimator/extenders.py @@ -34,7 +34,7 @@ _VALID_METRIC_FN_ARGS = set(['features', 'labels', 'predictions', 'config']) def add_metrics(estimator, metric_fn): - """Creates a new @{tf.estimator.Estimator} which has given metrics. + """Creates a new `tf.estimator.Estimator` which has given metrics. Example: @@ -61,7 +61,7 @@ def add_metrics(estimator, metric_fn): ``` Args: - estimator: A @{tf.estimator.Estimator} object. + estimator: A `tf.estimator.Estimator` object. metric_fn: A function which should obey the following signature: - Args: can only have following four arguments in any order: * predictions: Predictions `Tensor` or dict of `Tensor` created by given @@ -79,7 +79,7 @@ def add_metrics(estimator, metric_fn): function, namely a `(metric_tensor, update_op)` tuple. Returns: - A new @{tf.estimator.Estimator} which has a union of original metrics with + A new `tf.estimator.Estimator` which has a union of original metrics with given ones. """ _verify_metric_fn_args(metric_fn) @@ -165,14 +165,14 @@ def forward_features(estimator, keys=None): ``` Args: - estimator: A @{tf.estimator.Estimator} object. + estimator: A `tf.estimator.Estimator` object. keys: a `string` or a `list` of `string`. If it is `None`, all of the `features` in `dict` is forwarded to the `predictions`. If it is a `string`, only given key is forwarded. If it is a `list` of strings, all the given `keys` are forwarded. Returns: - A new @{tf.estimator.Estimator} which forwards features to predictions. + A new `tf.estimator.Estimator` which forwards features to predictions. Raises: ValueError: diff --git a/tensorflow/contrib/estimator/python/estimator/linear.py b/tensorflow/contrib/estimator/python/estimator/linear.py index 62a37abefb1f6ed291df1df3da6de35bfd2b6c52..2b68f24eb2d4c528bc1cb87e7d858014f66c0433 100644 --- a/tensorflow/contrib/estimator/python/estimator/linear.py +++ b/tensorflow/contrib/estimator/python/estimator/linear.py @@ -121,7 +121,7 @@ class LinearEstimator(estimator.Estimator): is multivalent. One of "mean", "sqrtn", and "sum" -- these are effectively different ways to do example-level normalization, which can be useful for bag-of-words features. for more details, see - @{tf.feature_column.linear_model$linear_model}. + `tf.feature_column.linear_model`. """ def _model_fn(features, labels, mode, config): return linear_lib._linear_model_fn( # pylint: disable=protected-access diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py index b0082f7e550b069c072654e3c3fec8f917a84478..ce98e9987ec728fadf170e56fe4bfe24fc9a0105 100644 --- a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py +++ b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py @@ -148,7 +148,7 @@ class SavedModelEstimator(estimator_lib.Estimator): super(SavedModelEstimator, self).__init__( model_fn=self._model_fn_from_saved_model, model_dir=model_dir, warm_start_from=warm_start_settings) - if self._distribution is not None: + if self._train_distribution or self._eval_distribution: raise NotImplementedError( 'SavedModelEstimator currently does not support ' 'DistributionStrategy.') diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index effec42f028fe472593a8d06e15a0831346d6f50..9e1f14f9905d584287864c15d9b6f9c152d17787 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -65,7 +65,7 @@ tf_custom_op_py_library( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/estimator", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", ], @@ -242,7 +242,7 @@ py_test( "//tensorflow/python:platform_benchmark", "//tensorflow/python:random_ops", "//tensorflow/python:training", - "//tensorflow/python/estimator:run_config", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", ], diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py index 9ffdd3ba5e8ac496533d0207f2b6846dbc92bc89..f384d761a8430074f022c973d7ec3d46cd90f70b 100644 --- a/tensorflow/contrib/factorization/python/ops/kmeans.py +++ b/tensorflow/contrib/factorization/python/ops/kmeans.py @@ -158,12 +158,12 @@ class _ModelFn(object): return either `features` or, equivalently, `(features, None)`. Args: - features: The input points. See @{tf.estimator.Estimator}. - mode: See @{tf.estimator.Estimator}. - config: See @{tf.estimator.Estimator}. + features: The input points. See `tf.estimator.Estimator`. + mode: See `tf.estimator.Estimator`. + config: See `tf.estimator.Estimator`. Returns: - A @{tf.estimator.EstimatorSpec} (see @{tf.estimator.Estimator}) specifying + A `tf.estimator.EstimatorSpec` (see `tf.estimator.Estimator`) specifying this behavior: * `train_op`: Execute one mini-batch or full-batch run of Lloyd's algorithm. @@ -188,7 +188,6 @@ class _ModelFn(object): # center. # is_initialized: scalar indicating whether the initial cluster centers # have been chosen; see init_op. - # cluster_centers_var: a Variable containing the cluster centers. # init_op: an op to choose the initial cluster centers. A single worker # repeatedly executes init_op until is_initialized becomes True. # training_op: an op that runs an iteration of training, either an entire @@ -394,7 +393,7 @@ class KMeansClustering(estimator.Estimator): relative_tolerance: A relative tolerance of change in the loss between iterations. Stops learning if the loss changes less than this amount. This may not work correctly if `use_mini_batch=True`. - config: See @{tf.estimator.Estimator}. + config: See `tf.estimator.Estimator`. feature_columns: An optionable iterable containing all the feature columns used by the model. All items in the set should be feature column instances that can be passed to `tf.feature_column.input_layer`. If this @@ -431,7 +430,7 @@ class KMeansClustering(estimator.Estimator): """Finds the index of the closest cluster center to each input point. Args: - input_fn: Input points. See @{tf.estimator.Estimator.predict}. + input_fn: Input points. See `tf.estimator.Estimator.predict`. Yields: The index of the closest cluster center for each input point. @@ -447,7 +446,7 @@ class KMeansClustering(estimator.Estimator): which returns the negative sum. Args: - input_fn: Input points. See @{tf.estimator.Estimator.evaluate}. Only one + input_fn: Input points. See `tf.estimator.Estimator.evaluate`. Only one batch is retrieved. Returns: @@ -465,7 +464,7 @@ class KMeansClustering(estimator.Estimator): sklearn function returns the Euclidean distance. Args: - input_fn: Input points. See @{tf.estimator.Estimator.predict}. + input_fn: Input points. See `tf.estimator.Estimator.predict`. Yields: The distances from each input point to each cluster center. diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index dc49383c5c300e82839c478e097074b3e8776b3b..20d099fe5d49dac0caec4a28801f09e7bee4f2e2 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -100,6 +100,8 @@ See the @{$python/contrib.framework} guide. @@BoundedTensorSpec @@TensorSpec + +@@RecordInput """ from __future__ import absolute_import @@ -119,6 +121,7 @@ from tensorflow.python.framework.smart_cond import smart_cond from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec +from tensorflow.python.ops.data_flow_ops import RecordInput from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d @@ -133,6 +136,7 @@ _nest_allowed_symbols = [ 'flatten_dict_items', 'pack_sequence_as', 'map_structure', + 'map_structure_with_paths', 'assert_shallow_structure', 'flatten_up_to', 'map_structure_up_to', diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py index 9e356dd96562c28adec7fc28fe144394e1c2ed38..e7184a01fbf57319399fc6dd287b7387138b4058 100644 --- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py @@ -27,7 +27,7 @@ from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training import saver +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import training as train __all__ = [ @@ -40,7 +40,7 @@ __all__ = [ def _get_checkpoint_filename(filepattern): """Returns checkpoint filename given directory or specific filepattern.""" if gfile.IsDirectory(filepattern): - return saver.latest_checkpoint(filepattern) + return checkpoint_management.latest_checkpoint(filepattern) return filepattern diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py index 5b150339953f961c756c0909dd1795341159b9cd..0a02e76a265c8ad25d978e7d610fb50fc0fdfdb1 100644 --- a/tensorflow/contrib/framework/python/ops/arg_scope.py +++ b/tensorflow/contrib/framework/python/ops/arg_scope.py @@ -103,9 +103,8 @@ def _kwarg_names(func): def _add_op(op): - key = arg_scope_func_key(op) - if key not in _DECORATED_OPS: - _DECORATED_OPS[key] = _kwarg_names(op) + key_op = arg_scope_func_key(op) + _DECORATED_OPS[key_op] = _kwarg_names(op) @tf_contextlib.contextmanager diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py index 4c3879d4fc08b53ea8be5f1256a830a64fb39af6..bcafc1a3280ba0435f655eacb8173e4e97051154 100644 --- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py +++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py @@ -38,6 +38,12 @@ def func3(args, a=None, b=1, c=2): """Some cool doc string.""" return (args, a, b, c) +@add_arg_scope +def func4(x='x', y='y'): + if x: + pass + if y: + pass def _key_op(op): return getattr(op, '_key_op', str(op)) @@ -231,6 +237,15 @@ class ArgScopeTest(test.TestCase): self.assertTupleEqual(args, func2_args) self.assertDictEqual(kwargs, func2_kwargs) + def testAddArgScopeRaceCondition(self): + func4_kwargs = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h') + for i in range(4): + # redefine the function with different args + @add_arg_scope + def func4(a=1, b=2, c=3, d=4, e=5, f=6, g=7, h=8): + pass + self.assertTupleEqual(arg_scoped_arguments(func4), func4_kwargs) + def testDocString(self): self.assertEqual(func3.__doc__, 'Some cool doc string.') diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py index 322d5c335e6a77c46c7ce5dd795e21a2d5a1f8f9..a7acae804a0c71cc19757a48d47fd9cf9022b0e2 100644 --- a/tensorflow/contrib/framework/python/ops/variables.py +++ b/tensorflow/contrib/framework/python/ops/variables.py @@ -241,13 +241,13 @@ def variable(name, use_resource: If `True` use a ResourceVariable instead of a Variable. synchronization: Indicates when a distributed a variable will be aggregated. Accepted values are constants defined in the class - @{tf.VariableSynchronization}. By default the synchronization is set to + `tf.VariableSynchronization`. By default the synchronization is set to `AUTO` and the current `DistributionStrategy` chooses when to synchronize. If `synchronization` is set to `ON_READ`, `trainable` must not be set to `True`. aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class - @{tf.VariableAggregation}. + `tf.VariableAggregation`. Returns: The created or existing variable. @@ -320,13 +320,13 @@ def model_variable(name, use_resource: If `True` use a ResourceVariable instead of a Variable. synchronization: Indicates when a distributed a variable will be aggregated. Accepted values are constants defined in the class - @{tf.VariableSynchronization}. By default the synchronization is set to + `tf.VariableSynchronization`. By default the synchronization is set to `AUTO` and the current `DistributionStrategy` chooses when to synchronize. If `synchronization` is set to `ON_READ`, `trainable` must not be set to `True`. aggregation: Indicates how a distributed variable will be aggregated. Accepted values are constants defined in the class - @{tf.VariableAggregation}. + `tf.VariableAggregation`. Returns: The created or existing variable. diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD index 053d4e3e977ed1baed8ceeca1a983e999b1ad1ff..9866fccfba3562221ea7fe845e860ab470e238a0 100644 --- a/tensorflow/contrib/gan/BUILD +++ b/tensorflow/contrib/gan/BUILD @@ -424,9 +424,11 @@ py_library( ":namedtuples", "//tensorflow/python:array_ops", "//tensorflow/python:framework_ops", + "//tensorflow/python:functional_ops", "//tensorflow/python:math_ops", "//tensorflow/python:summary", "//tensorflow/python:util", + "//tensorflow/python:variable_scope", "//tensorflow/python/ops/losses", ], ) @@ -459,8 +461,7 @@ py_library( ":train", "//tensorflow/python:framework_ops", "//tensorflow/python:util", - "//tensorflow/python/estimator:head", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -477,7 +478,7 @@ py_test( "//tensorflow/python:math_ops", "//tensorflow/python:training", "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -497,8 +498,7 @@ py_library( "//tensorflow/python:metrics", "//tensorflow/python:util", "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -526,8 +526,7 @@ py_test( "//tensorflow/python:training", "//tensorflow/python:training_util", "//tensorflow/python:variable_scope", - "//tensorflow/python/estimator:model_fn", - "//tensorflow/python/estimator:numpy_io", + "//tensorflow/python/estimator:estimator_py", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", "@six_archive//:six", diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index 8e4affb9b4f95bf5afab0f50c86954e60a942279..ab9886580d1648852e08f64cb3e9b51f679c25de 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -53,9 +53,6 @@ _summary_type_map = { } -# TODO(joelshor): For now, this only supports 1:1 generator:discriminator -# training sequentially. Find a nice way to expose options to the user without -# exposing internals. class GANEstimator(estimator.Estimator): """An estimator for Generative Adversarial Networks (GANs). diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py index 508f487722fba89cc8391a340f73673a526e86c4..f9995bb19d0d09eaf6fd96d039b0bba1d3a7055c 100644 --- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py @@ -22,7 +22,9 @@ from tensorflow.contrib.gan.python import namedtuples from tensorflow.contrib.gan.python.eval.python import eval_utils from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variable_scope from tensorflow.python.ops.losses import util as loss_util from tensorflow.python.summary import summary @@ -32,6 +34,7 @@ __all__ = [ 'add_gan_model_summaries', 'add_regularization_loss_summaries', 'add_cyclegan_image_summaries', + 'add_stargan_image_summaries' ] @@ -179,6 +182,94 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2, max_outputs=1) +def add_stargan_image_summaries(stargan_model, + num_images=2, + display_diffs=False): + """Adds image summaries to see StarGAN image results. + + If display_diffs is True, each image result has `2` rows and `num_domains + 1` + columns. + The first row looks like: + [original_image, transformed_to_domain_0, transformed_to_domain_1, ...] + The second row looks like: + [no_modification_baseline, transformed_to_domain_0-original_image, ...] + If display_diffs is False, only the first row is shown. + + IMPORTANT: + Since the model originally does not transformed the image to every domains, + we will transform them on-the-fly within this function in parallel. + + Args: + stargan_model: A StarGANModel tuple. + num_images: The number of examples/images to be transformed and shown. + display_diffs: Also display the difference between generated and target. + + Raises: + ValueError: If input_data is not images. + ValueError: If input_data_domain_label is not rank 2. + ValueError: If dimension 2 of input_data_domain_label is not fully defined. + """ + + _assert_is_image(stargan_model.input_data) + stargan_model.input_data_domain_label.shape.assert_has_rank(2) + stargan_model.input_data_domain_label.shape[1:].assert_is_fully_defined() + + num_domains = stargan_model.input_data_domain_label.get_shape().as_list()[-1] + + def _build_image(image): + """Helper function to create a result for each image on the fly.""" + + # Expand the first dimension as batch_size = 1. + images = array_ops.expand_dims(image, axis=0) + + # Tile the image num_domains times, so we can get all transformed together. + images = array_ops.tile(images, [num_domains, 1, 1, 1]) + + # Create the targets to 0, 1, 2, ..., num_domains-1. + targets = array_ops.one_hot(list(range(num_domains)), num_domains) + + with variable_scope.variable_scope( + stargan_model.generator_scope, reuse=True): + + # Add the original image. + output_images_list = [image] + + # Generate the image and add to the list. + gen_images = stargan_model.generator_fn(images, targets) + gen_images_list = array_ops.split(gen_images, num_domains) + gen_images_list = [ + array_ops.squeeze(img, axis=0) for img in gen_images_list + ] + output_images_list.extend(gen_images_list) + + # Display diffs. + if display_diffs: + diff_images = gen_images - images + diff_images_list = array_ops.split(diff_images, num_domains) + diff_images_list = [ + array_ops.squeeze(img, axis=0) for img in diff_images_list + ] + output_images_list.append(array_ops.zeros_like(image)) + output_images_list.extend(diff_images_list) + + # Create the final image. + final_image = eval_utils.image_reshaper( + output_images_list, num_cols=num_domains + 1) + + # Reduce the first rank. + return array_ops.squeeze(final_image, axis=0) + + summary.image( + 'stargan_image_generation', + functional_ops.map_fn( + _build_image, + stargan_model.input_data[:num_images], + parallel_iterations=num_images, + back_prop=False, + swap_memory=True), + max_outputs=num_images) + + def add_gan_model_summaries(gan_model): """Adds typical GANModel summaries. diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py index 33d51bfc218ab93fb52439b1eefed98a4568c4a1..54a6f8d4d9086ad7fc8db31032677628561e48e8 100644 --- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py +++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - from tensorflow.contrib.gan.python import namedtuples from tensorflow.contrib.gan.python.eval.python import summaries_impl as summaries from tensorflow.python.framework import ops @@ -37,6 +36,10 @@ def discriminator_model(inputs, _): return variable_scope.get_variable('dummy_d', initializer=2.0) * inputs +def stargan_generator_model(inputs, _): + return generator_model(inputs) + + def get_gan_model(): # TODO(joelshor): Find a better way of creating a variable scope. with variable_scope.variable_scope('generator') as gen_scope: @@ -57,6 +60,31 @@ def get_gan_model(): discriminator_fn=discriminator_model) +def get_stargan_model(): + """Similar to get_gan_model().""" + # TODO(joelshor): Find a better way of creating a variable scope. + with variable_scope.variable_scope('discriminator') as dis_scope: + pass + with variable_scope.variable_scope('generator') as gen_scope: + return namedtuples.StarGANModel( + input_data=array_ops.ones([1, 2, 2, 3]), + input_data_domain_label=array_ops.ones([1, 2]), + generated_data=stargan_generator_model( + array_ops.ones([1, 2, 2, 3]), None), + generated_data_domain_target=array_ops.ones([1, 2]), + reconstructed_data=array_ops.ones([1, 2, 2, 3]), + discriminator_input_data_source_predication=array_ops.ones([1]), + discriminator_generated_data_source_predication=array_ops.ones([1]), + discriminator_input_data_domain_predication=array_ops.ones([1, 2]), + discriminator_generated_data_domain_predication=array_ops.ones([1, 2]), + generator_variables=None, + generator_scope=gen_scope, + generator_fn=stargan_generator_model, + discriminator_variables=None, + discriminator_scope=dis_scope, + discriminator_fn=discriminator_model) + + def get_cyclegan_model(): with variable_scope.variable_scope('x2y'): model_x2y = get_gan_model() @@ -143,6 +171,16 @@ class SummariesTest(test.TestCase): with self.test_session(use_gpu=True): summary.merge_all().eval() + def test_add_image_comparison_summaries_for_stargan(self): + + summaries.add_stargan_image_summaries(get_stargan_model()) + + self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) + + with self.test_session(use_gpu=True) as sess: + sess.run(variables.global_variables_initializer()) + summary.merge_all().eval() + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py index 03f52d214b5ac2fef075fb66018f88d2be5c1941..9e5aea1498a7e9d47480af18cad9f80ede84c0f9 100644 --- a/tensorflow/contrib/gan/python/train.py +++ b/tensorflow/contrib/gan/python/train.py @@ -52,7 +52,6 @@ from tensorflow.python.training import session_run_hook from tensorflow.python.training import sync_replicas_optimizer from tensorflow.python.training import training_util - __all__ = [ 'gan_model', 'infogan_model', @@ -61,6 +60,7 @@ __all__ = [ 'stargan_model', 'gan_loss', 'cyclegan_loss', + 'stargan_loss', 'gan_train_ops', 'gan_train', 'get_sequential_train_hooks', @@ -646,8 +646,9 @@ def gan_loss( type(model)) # Optionally create pooled model. - pooled_model = (_tensor_pool_adjusted_model(model, tensor_pool_fn) if - tensor_pool_fn else model) + pooled_model = ( + _tensor_pool_adjusted_model(model, tensor_pool_fn) + if tensor_pool_fn else model) # Create standard losses. gen_loss = generator_loss_fn(model, add_summaries=add_summaries) @@ -665,9 +666,10 @@ def gan_loss( if _use_aux_loss(mutual_information_penalty_weight): gen_info_loss = tfgan_losses.mutual_information_penalty( model, add_summaries=add_summaries) - dis_info_loss = (gen_info_loss if tensor_pool_fn is None else - tfgan_losses.mutual_information_penalty( - pooled_model, add_summaries=add_summaries)) + dis_info_loss = ( + gen_info_loss + if tensor_pool_fn is None else tfgan_losses.mutual_information_penalty( + pooled_model, add_summaries=add_summaries)) gen_loss += mutual_information_penalty_weight * gen_info_loss dis_loss += mutual_information_penalty_weight * dis_info_loss if _use_aux_loss(aux_cond_generator_weight): diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc index f3bbf6b4d78b50b11e23abd584bacff8f3d877c7..7e6a0f14f6f5e467801fef39ebb597565b3d7e98 100644 --- a/tensorflow/contrib/gdr/gdr_memory_manager.cc +++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc @@ -174,7 +174,7 @@ class GdrMemoryManager : public RemoteMemoryManager { // Client side endpoints mutex client_mu_; std::map, RdmaEndpointPtr> clients_ - GUARDED_BY(cient_mu_); + GUARDED_BY(client_mu_); // Managed memory regions mutex alloc_mu_; diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py index 026a3d1200033400472c4fd763a244c04b284a9b..e79ccd8da1f8952758ae322d3a92dec34910a9db 100644 --- a/tensorflow/contrib/graph_editor/transform.py +++ b/tensorflow/contrib/graph_editor/transform.py @@ -129,7 +129,7 @@ def transform_op_if_inside_handler(info, op, keep_if_possible=True): return None -def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None): +def copy_op_handler(info, op, new_inputs, copy_shape=False, nodedef_fn=None): """Copy a `tf.Operation`. Args: diff --git a/tensorflow/contrib/hadoop/BUILD b/tensorflow/contrib/hadoop/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..ccad31efa1dba92d954ff1cb455b6c9c784b29bc --- /dev/null +++ b/tensorflow/contrib/hadoop/BUILD @@ -0,0 +1,117 @@ +package(default_visibility = ["//tensorflow:internal"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load( + "//tensorflow:tensorflow.bzl", + "tf_custom_op_library", + "tf_custom_op_py_library", + "tf_gen_op_libs", + "tf_gen_op_wrapper_py", + "tf_kernel_library", + "tf_py_test", +) + +filegroup( + name = "test_data", + srcs = glob(["python/kernel_tests/testdata/*"]), +) + +py_library( + name = "hadoop", + srcs = ["__init__.py"], + srcs_version = "PY2AND3", + deps = [ + ":dataset_ops", + ], +) + +tf_custom_op_library( + name = "_dataset_ops.so", + srcs = ["ops/dataset_ops.cc"], + deps = [ + ":dataset_kernels", + ], +) + +tf_gen_op_libs( + op_lib_names = ["dataset_ops"], +) + +cc_library( + name = "dataset_kernels", + srcs = ["kernels/hadoop_dataset_ops.cc"], + deps = [ + "//tensorflow/core:framework_headers_lib", + "//third_party/eigen3", + "@protobuf_archive//:protobuf_headers", + ], + alwayslink = 1, +) + +py_library( + name = "dataset_ops", + srcs = [ + "python/ops/hadoop_dataset_ops.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":hadoop_op_loader", + "//tensorflow/python:dataset_ops_gen", + "//tensorflow/python:util", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/data/util:nest", + ], +) + +tf_gen_op_wrapper_py( + name = "gen_dataset_ops", + out = "python/ops/gen_dataset_ops.py", + deps = ["//tensorflow/contrib/hadoop:dataset_ops_op_lib"], +) + +tf_kernel_library( + name = "dataset_ops_kernels", + deps = [ + ":dataset_kernels", + "//tensorflow/core:framework", + ], + alwayslink = 1, +) + +tf_custom_op_py_library( + name = "hadoop_op_loader", + srcs = ["python/ops/hadoop_op_loader.py"], + dso = ["//tensorflow/contrib/hadoop:_dataset_ops.so"], + kernels = [ + ":dataset_ops_kernels", + "//tensorflow/contrib/hadoop:dataset_ops_op_lib", + ], + srcs_version = "PY2AND3", + deps = [ + ":gen_dataset_ops", + "//tensorflow/contrib/util:util_py", + "//tensorflow/python:platform", + ], +) + +tf_py_test( + name = "hadoop_test", + srcs = ["python/kernel_tests/hadoop_test.py"], + additional_deps = [ + ":hadoop", + "//third_party/py/numpy", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], + data = [ + ":test_data", + ], + tags = [ + "notap", + ], +) diff --git a/tensorflow/contrib/hadoop/__init__.py b/tensorflow/contrib/hadoop/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf8cd4845f9713ebd8a647af191000061e01ad1 --- /dev/null +++ b/tensorflow/contrib/hadoop/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Sequence File Dataset. + +@@SequenceFileDataset +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.hadoop.python.ops.hadoop_dataset_ops import SequenceFileDataset + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + "SequenceFileDataset", +] + +remove_undocumented(__name__) diff --git a/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc b/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..80b2d3e08b6745b776aa7b4073e841145defd3c4 --- /dev/null +++ b/tensorflow/contrib/hadoop/kernels/hadoop_dataset_ops.cc @@ -0,0 +1,340 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" +#include "tensorflow/core/platform/file_system.h" + +namespace tensorflow { +namespace { + +static const size_t kSyncMarkerSize = 16; +static const size_t kSequenceFileBufferSize = 1024 * 1024; + +class SequenceFileReader { + public: + explicit SequenceFileReader(RandomAccessFile* file) + : input_stream_( + new io::BufferedInputStream(file, kSequenceFileBufferSize)) {} + + Status ReadHeader() { + string version; + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(4, &version)); + if (version.substr(0, 3) != "SEQ" || version[3] != 6) { + return errors::InvalidArgument( + "sequence file header must starts with `SEQ6`, received \"", + version.substr(0, 3), static_cast(version[3]), "\""); + } + TF_RETURN_IF_ERROR(ReadString(&key_class_name_)); + TF_RETURN_IF_ERROR(ReadString(&value_class_name_)); + + // At the moment we only support `org.apache.hadoop.io.Text` for key/value. + // TODO (yongtang): Add more class name support. + if (key_class_name_ != "org.apache.hadoop.io.Text" || + value_class_name_ != "org.apache.hadoop.io.Text") { + return errors::Unimplemented("key/value of '", key_class_name_, "/", + value_class_name_, + "' is currently not supported"); + } + + string buffer; + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(2, &buffer)); + compression_ = buffer[0]; + block_compression_ = buffer[1]; + if (compression_ || block_compression_) { + TF_RETURN_IF_ERROR(ReadString(&compression_codec_class_name_)); + } + + // At the moment no compression is supported. + // TODO (yongtang): Add compression support. + if (compression_ || block_compression_) { + return errors::Unimplemented("compression is currently not supported"); + } + + // Not interested in metadata for now. + uint32 num_metadata_pairs = 0; + TF_RETURN_IF_ERROR(ReadUInt32(&num_metadata_pairs)); + if (num_metadata_pairs > 1024) { + return errors::InvalidArgument( + "sequence file metadata should have key value pairs < 1024, " + "received ", + num_metadata_pairs); + } + for (int i = 0; i < num_metadata_pairs; i++) { + TF_RETURN_IF_ERROR(ReadString(nullptr)); + TF_RETURN_IF_ERROR(ReadString(nullptr)); + } + + TF_RETURN_IF_ERROR( + input_stream_->ReadNBytes(kSyncMarkerSize, &sync_marker_)); + + return Status::OK(); + } + + Status ReadRecord(string* key, string* value) { + uint32 length = 0; + TF_RETURN_IF_ERROR(ReadUInt32(&length)); + if (length == static_cast(-1)) { + // Sync marker. + string sync_marker; + TF_RETURN_IF_ERROR( + input_stream_->ReadNBytes(kSyncMarkerSize, &sync_marker)); + if (sync_marker != sync_marker_) { + return errors::InvalidArgument( + "sequence file should have sync marker \"", sync_marker_, + "\" at pos ", input_stream_->Tell() - kSyncMarkerSize, + ", received \"", sync_marker, "\""); + } + return ReadRecord(key, value); + } + uint32 key_length = 0; + TF_RETURN_IF_ERROR(ReadUInt32(&key_length)); + if (key_length > length) { + return errors::InvalidArgument("key length (", key_length, + ") should be < record length (", length, + ")"); + } + // At the moment we only support `org.apache.hadoop.io.Text` for key/value. + // TODO (yongtang): Expand supported format. + TF_RETURN_IF_ERROR(ReadString(key)); + TF_RETURN_IF_ERROR(ReadString(value)); + return Status::OK(); + } + + Status ReadString(string* value) { + int64 length = 0; + TF_RETURN_IF_ERROR(ReadVInt(&length)); + if (value == nullptr) { + return input_stream_->SkipNBytes(length); + } + return input_stream_->ReadNBytes(length, value); + } + + Status ReadUInt32(uint32* value) { + string buffer; + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(4, &buffer)); + *value = ((static_cast(buffer[0]) << 24) | + static_cast(buffer[1]) << 16) | + (static_cast(buffer[2]) << 8) | + static_cast(buffer[3]); + return Status::OK(); + } + + Status ReadVInt(int64* value) { + string buffer; + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(1, &buffer)); + if (buffer[0] >= -112) { + *value = static_cast(buffer[0]); + return Status::OK(); + } + + int64 remaining = 0; + bool negative = false; + if (buffer[0] >= -120) { + remaining = static_cast(-112) - static_cast(buffer[0]); + } else { + remaining = static_cast(-120) - static_cast(buffer[0]); + negative = true; + } + buffer.clear(); + TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(remaining, &buffer)); + + uint64 v = 0; + for (int i = 0; i < buffer.size(); i++) { + v = (v << 8) | static_cast(buffer[i]); + } + if (negative) { + v = ~v; + } + *value = static_cast(v); + return Status::OK(); + } + + virtual ~SequenceFileReader() = default; + + private: + std::unique_ptr input_stream_; + string key_class_name_; + string value_class_name_; + string sync_marker_; + bool compression_; + bool block_compression_; + string compression_codec_class_name_; + TF_DISALLOW_COPY_AND_ASSIGN(SequenceFileReader); +}; +class SequenceFileDatasetOp : public DatasetOpKernel { + public: + using DatasetOpKernel::DatasetOpKernel; + explicit SequenceFileDatasetOp(OpKernelConstruction* ctx) + : DatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + for (const DataType& dt : output_types_) { + OP_REQUIRES(ctx, dt == DT_STRING, + errors::InvalidArgument( + "Each element of `output_types_` must be one of: " + "DT_STRING")); + } + } + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { + const Tensor* filenames_tensor; + OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor)); + OP_REQUIRES( + ctx, filenames_tensor->dims() <= 1, + errors::InvalidArgument("`filenames` must be a scalar or a vector.")); + + std::vector filenames; + filenames.reserve(filenames_tensor->NumElements()); + for (int i = 0; i < filenames_tensor->NumElements(); ++i) { + filenames.push_back(filenames_tensor->flat()(i)); + } + + *output = new Dataset(ctx, filenames, output_types_); + } + + private: + class Dataset : public DatasetBase { + public: + Dataset(OpKernelContext* ctx, const std::vector& filenames, + const DataTypeVector& output_types) + : DatasetBase(DatasetContext(ctx)), + filenames_(filenames), + output_types_(output_types) {} + + std::unique_ptr MakeIteratorInternal( + const string& prefix) const override { + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::SequenceFile")})); + } + + const DataTypeVector& output_dtypes() const override { + return output_types_; + } + + const std::vector& output_shapes() const override { + static std::vector* shapes = + new std::vector({{}, {}}); + return *shapes; + } + + string DebugString() const override { + return "SequenceFileDatasetOp::Dataset"; + } + + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + Node* filenames = nullptr; + TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames)); + TF_RETURN_IF_ERROR(b->AddDataset(this, {filenames}, output)); + return Status::OK(); + } + + private: + class Iterator : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params) {} + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(mu_); + do { + // We are currently processing a file, so try to read the next record. + if (reader_) { + string key, value; + Status status = reader_->ReadRecord(&key, &value); + if (!errors::IsOutOfRange(status)) { + TF_RETURN_IF_ERROR(status); + + Tensor key_tensor(ctx->allocator({}), DT_STRING, {}); + key_tensor.scalar()() = key; + out_tensors->emplace_back(std::move(key_tensor)); + + Tensor value_tensor(ctx->allocator({}), DT_STRING, {}); + value_tensor.scalar()() = value; + out_tensors->emplace_back(std::move(value_tensor)); + + *end_of_sequence = false; + return Status::OK(); + } + // We have reached the end of the current file, so maybe + // move on to next file. + ResetStreamsLocked(); + ++current_file_index_; + } + + // Iteration ends when there are no more files to process. + if (current_file_index_ == dataset()->filenames_.size()) { + *end_of_sequence = true; + return Status::OK(); + } + + TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env())); + } while (true); + } + + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + return errors::Unimplemented("SaveInternal is currently not supported"); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + return errors::Unimplemented( + "RestoreInternal is currently not supported"); + } + + private: + // Sets up SequenceFile streams to read from the topic at + // `current_file_index_`. + Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (current_file_index_ >= dataset()->filenames_.size()) { + return errors::InvalidArgument( + "current_file_index_:", current_file_index_, + " >= filenames_.size():", dataset()->filenames_.size()); + } + + // Actually move on to next file. + const string& filename = dataset()->filenames_[current_file_index_]; + TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file_)); + reader_.reset(new SequenceFileReader(file_.get())); + return reader_->ReadHeader(); + } + + // Resets all Hadoop SequenceFile streams. + void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { + reader_.reset(); + file_.reset(); + } + + mutex mu_; + size_t current_file_index_ GUARDED_BY(mu_) = 0; + std::unique_ptr file_ GUARDED_BY(mu_); + std::unique_ptr reader_ GUARDED_BY(mu_); + }; + + const std::vector filenames_; + const DataTypeVector output_types_; + }; + DataTypeVector output_types_; +}; +} // namespace + +REGISTER_KERNEL_BUILDER(Name("SequenceFileDataset").Device(DEVICE_CPU), + SequenceFileDatasetOp); + +} // namespace tensorflow diff --git a/tensorflow/core/platform/s3/s3_crypto.h b/tensorflow/contrib/hadoop/ops/dataset_ops.cc similarity index 51% rename from tensorflow/core/platform/s3/s3_crypto.h rename to tensorflow/contrib/hadoop/ops/dataset_ops.cc index e376b8b0c0e11f3115ddf1103b06dad16f3f12ce..66ad549b4756028a45c1ce76db4a2367517f81a5 100644 --- a/tensorflow/core/platform/s3/s3_crypto.h +++ b/tensorflow/contrib/hadoop/ops/dataset_ops.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,24 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include -#include -#include -namespace tensorflow { -static const char* S3CryptoAllocationTag = "S3CryptoAllocation"; +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" -class S3SHA256Factory : public Aws::Utils::Crypto::HashFactory { - public: - std::shared_ptr CreateImplementation() - const override; -}; +namespace tensorflow { -class S3SHA256HmacFactory : public Aws::Utils::Crypto::HMACFactory { - public: - std::shared_ptr CreateImplementation() - const override; -}; +REGISTER_OP("SequenceFileDataset") + .Input("filenames: string") + .Output("handle: variant") + .Attr("output_types: list(type) >= 1") + .SetIsStateful() + .SetShapeFn(shape_inference::ScalarShape); } // namespace tensorflow diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py new file mode 100644 index 0000000000000000000000000000000000000000..d796e43d877e463fa4398741748013b2eb661155 --- /dev/null +++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py @@ -0,0 +1,66 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# ============================================================================== +"""Tests for SequenceFileDataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.platform import resource_loader +from tensorflow.python.platform import test + + +class SequenceFileDatasetTest(test.TestCase): + + def test_sequence_file_dataset(self): + """Test case for SequenceFileDataset. + + The file is generated with `org.apache.hadoop.io.Text` for key/value. + There are 25 records in the file with the format of: + key = XXX + value = VALUEXXX + where XXX is replaced as the line number (starts with 001). + """ + filename = os.path.join(resource_loader.get_data_files_path(), + "testdata", "string.seq") + + filenames = constant_op.constant([filename], dtypes.string) + num_repeats = 2 + + dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat( + num_repeats) + iterator = dataset.make_initializable_iterator() + init_op = iterator.initializer + get_next = iterator.get_next() + + with self.test_session() as sess: + sess.run(init_op) + for _ in range(num_repeats): # Dataset is repeated. + for i in range(25): # 25 records. + v0 = b"%03d" % (i + 1) + v1 = b"VALUE%03d" % (i + 1) + self.assertEqual((v0, v1), sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/testdata/string.seq b/tensorflow/contrib/hadoop/python/kernel_tests/testdata/string.seq new file mode 100755 index 0000000000000000000000000000000000000000..b7175338af3417a8858d66082ab5a616f87cb234 Binary files /dev/null and b/tensorflow/contrib/hadoop/python/kernel_tests/testdata/string.seq differ diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..6e0e628655fbc32a43fad2dc4883b26c6ad57c48 --- /dev/null +++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py @@ -0,0 +1,75 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SequenceFile Dataset.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.hadoop.python.ops import gen_dataset_ops +from tensorflow.contrib.hadoop.python.ops import hadoop_op_loader # pylint: disable=unused-import +from tensorflow.python.data.ops.dataset_ops import Dataset +from tensorflow.python.data.util import nest +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape + + +class SequenceFileDataset(Dataset): + """A Sequence File Dataset that reads the sequence file.""" + + def __init__(self, filenames): + """Create a `SequenceFileDataset`. + + `SequenceFileDataset` allows a user to read data from a hadoop sequence + file. A sequence file consists of (key value) pairs sequentially. At + the moment, `org.apache.hadoop.io.Text` is the only serialization type + being supported, and there is no compression support. + + For example: + + ```python + dataset = tf.contrib.hadoop.SequenceFileDataset("/foo/bar.seq") + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + # Prints the (key, value) pairs inside a hadoop sequence file. + while True: + try: + print(sess.run(next_element)) + except tf.errors.OutOfRangeError: + break + ``` + + Args: + filenames: A `tf.string` tensor containing one or more filenames. + """ + super(SequenceFileDataset, self).__init__() + self._filenames = ops.convert_to_tensor( + filenames, dtype=dtypes.string, name="filenames") + + def _as_variant_tensor(self): + return gen_dataset_ops.sequence_file_dataset( + self._filenames, nest.flatten(self.output_types)) + + @property + def output_classes(self): + return ops.Tensor, ops.Tensor + + @property + def output_shapes(self): + return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([])) + + @property + def output_types(self): + return dtypes.string, dtypes.string diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_op_loader.py b/tensorflow/contrib/hadoop/python/ops/hadoop_op_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..6dbf1253f3f746de0da9664b4262cb208bee9c98 --- /dev/null +++ b/tensorflow/contrib/hadoop/python/ops/hadoop_op_loader.py @@ -0,0 +1,24 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python helper for loading hadoop ops and kernels.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.util import loader +from tensorflow.python.platform import resource_loader + +_dataset_ops = loader.load_op_library( + resource_loader.get_path_to_datafile("../../_dataset_ops.so")) diff --git a/tensorflow/contrib/image/kernels/image_ops.cc b/tensorflow/contrib/image/kernels/image_ops.cc index 022e17d13963a14f81d76e683d13060d1f3f8a7e..693724b45751b82f942bc9416e6fb2ba33b30e22 100644 --- a/tensorflow/contrib/image/kernels/image_ops.cc +++ b/tensorflow/contrib/image/kernels/image_ops.cc @@ -71,6 +71,7 @@ class ImageProjectiveTransform : public OpKernel { void Compute(OpKernelContext* ctx) override { const Tensor& images_t = ctx->input(0); const Tensor& transform_t = ctx->input(1); + const Tensor& shape_t = ctx->input(2); OP_REQUIRES(ctx, images_t.shape().dims() == 4, errors::InvalidArgument("Input images must have rank 4")); OP_REQUIRES(ctx, @@ -81,11 +82,28 @@ class ImageProjectiveTransform : public OpKernel { ProjectiveGenerator::kNumParameters), errors::InvalidArgument( "Input transform should be num_images x 8 or 1 x 8")); - auto images = images_t.tensor(); - auto transform = transform_t.matrix(); + OP_REQUIRES(ctx, shape_t.dims() == 1, + errors::InvalidArgument("output shape must be 1-dimensional", + shape_t.shape().DebugString())); + OP_REQUIRES(ctx, shape_t.NumElements() == 2, + errors::InvalidArgument("output shape must have two elements", + shape_t.shape().DebugString())); + auto shape_vec = shape_t.vec(); + int32 out_height = shape_vec(0); + int32 out_width = shape_vec(1); + OP_REQUIRES(ctx, out_height > 0 && out_width > 0, + errors::InvalidArgument("output dimensions must be positive")); + Tensor* output_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(0, images_t.shape(), &output_t)); + OP_REQUIRES_OK(ctx, ctx->allocate_output( + 0, + TensorShape({images_t.dim_size(0), out_height, + out_width, images_t.dim_size(3)}), + &output_t)); auto output = output_t->tensor(); + auto images = images_t.tensor(); + auto transform = transform_t.matrix(); + (FillProjectiveTransform(interpolation_))( ctx->eigen_device(), &output, images, transform); } @@ -129,10 +147,11 @@ TF_CALL_double(DECLARE_FUNCTOR); } // end namespace functor -#define REGISTER(TYPE) \ - REGISTER_KERNEL_BUILDER(Name("ImageProjectiveTransform") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("dtype"), \ +#define REGISTER(TYPE) \ + REGISTER_KERNEL_BUILDER(Name("ImageProjectiveTransform") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("output_shape"), \ ImageProjectiveTransform) TF_CALL_uint8(REGISTER); diff --git a/tensorflow/contrib/image/kernels/image_ops.h b/tensorflow/contrib/image/kernels/image_ops.h index 209aa24548443bb10c13cd506b8c93c23cfff4a4..6b63eed1303accc330293b3a44cdb9def7881666 100644 --- a/tensorflow/contrib/image/kernels/image_ops.h +++ b/tensorflow/contrib/image/kernels/image_ops.h @@ -167,7 +167,7 @@ struct FillProjectiveTransform { void operator()(const Device& device, OutputType* output, const InputType& images, const TransformsType& transform) const { - output->device(device) = images.generate( + output->device(device) = output->generate( ProjectiveGenerator(images, transform, interpolation_)); } }; diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc index e59f1bf8443732a4b84fe7461439e3d0ee7dd158..4969ac58f96c8c0b829828ad7617a0bb5520cd6a 100644 --- a/tensorflow/contrib/image/ops/image_ops.cc +++ b/tensorflow/contrib/image/ops/image_ops.cc @@ -19,23 +19,66 @@ limitations under the License. namespace tensorflow { +using shape_inference::DimensionHandle; using shape_inference::InferenceContext; using shape_inference::ShapeHandle; +namespace { + +// Sets output[0] to shape [batch_dim,height,width,channel_dim], where +// height and width come from the size_tensor. +Status SetOutputToSizedImage(InferenceContext* c, DimensionHandle batch_dim, + int size_input_idx, DimensionHandle channel_dim) { + // Verify shape of size input. + ShapeHandle size; + TF_RETURN_IF_ERROR(c->WithRank(c->input(size_input_idx), 1, &size)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(size, 0), 2, &unused)); + + // Get size values from the size tensor. + const Tensor* size_tensor = c->input_tensor(size_input_idx); + DimensionHandle width; + DimensionHandle height; + if (size_tensor == nullptr) { + width = c->UnknownDim(); + height = c->UnknownDim(); + } else { + // TODO(petewarden) - Remove once we have constant evaluation in C++ only. + if (size_tensor->dtype() != DT_INT32) { + return errors::InvalidArgument( + "Bad size input type for SetOutputToSizedImage: Expected DT_INT32 " + "but got ", + DataTypeString(size_tensor->dtype()), " for input #", size_input_idx, + " in ", c->DebugString()); + } + auto vec = size_tensor->vec(); + height = c->MakeDim(vec(0)); + width = c->MakeDim(vec(1)); + } + c->set_output(0, c->MakeShape({batch_dim, height, width, channel_dim})); + return Status::OK(); +} + +// TODO(qyu): Move this to core/framework/common_shape_fns.h +Status ResizeShapeFn(InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + return SetOutputToSizedImage(c, c->Dim(input, 0), 2 /* size_input_idx */, + c->Dim(input, 3)); +} + +} // namespace + // TODO(ringwalt): Add a "fill_mode" argument with "constant", "mirror", etc. // TODO(ringwalt): Add a "fill_constant" argument for constant mode (default 0). -// TODO(ringwalt): Add an "output_shape" argument. This is sufficient to -// implement "same" and "valid" modes in the Python function. REGISTER_OP("ImageProjectiveTransform") .Input("images: dtype") .Input("transforms: float32") + .Input("output_shape: int32") .Attr("dtype: {uint8, int32, int64, float16, float32, float64}") .Attr("interpolation: string") .Output("transformed_images: dtype") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return Status::OK(); - }) + .SetShapeFn(ResizeShapeFn) .Doc(R"doc( Applies the given transform to each of the images. @@ -49,7 +92,7 @@ If one row of `transforms` is `[a0, a1, a2, b0, b1, b2, c0, c1]`, then it maps the *output* point `(x, y)` to a transformed *input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where `k = c0 x + c1 y + 1`. If the transformed point lays outside of the input -image, the output pixel is set to 0. The output is the same size as the input, +image, the output pixel is set to 0. images: 4D `Tensor`, input image(s) in NHWC format. transforms: 2D `Tensor`, projective transform(s) to apply to the image(s). diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py index 62a22dcf3411fb160b3c432bbdd67303697f7262..f588eae923f403f07c7f502821db4ef6acad71d5 100644 --- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py +++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops from tensorflow.python.platform import googletest _DTYPES = set( @@ -194,6 +195,19 @@ class ImageOpsTest(test_util.TensorFlowTestCase): [0.0, 149, 233, 149, 0.0], [0.0, 0.0, 87., 0.0, 0.0]]) + def test_rotate_static_shape(self): + image = array_ops.diag([1., 2., 3.]) + result = image_ops.rotate( + image, random_ops.random_uniform((), -1, 1), interpolation="BILINEAR") + self.assertEqual(image.get_shape(), result.get_shape()) + + def test_transform_static_output_shape(self): + image = constant_op.constant([[1., 2.], [3., 4.]]) + result = image_ops.transform( + image, random_ops.random_uniform([8], -1, 1), + output_shape=constant_op.constant([3, 5])) + self.assertAllEqual([3, 5], result.get_shape()) + def _test_grad(self, shape_to_test): with self.test_session(): test_image_shape = shape_to_test @@ -213,10 +227,40 @@ class ImageOpsTest(test_util.TensorFlowTestCase): x_init_value=test_image) self.assertLess(left_err, 1e-10) + def _test_grad_different_shape(self, input_shape, output_shape): + with self.test_session(): + test_image_shape = input_shape + test_image = np.random.randn(*test_image_shape) + test_image_tensor = constant_op.constant( + test_image, shape=test_image_shape) + test_transform = image_ops.angles_to_projective_transforms( + np.pi / 2, 4, 4) + + if len(output_shape) == 2: + resize_shape = output_shape + elif len(output_shape) == 3: + resize_shape = output_shape[0:2] + elif len(output_shape) == 4: + resize_shape = output_shape[1:3] + output = image_ops.transform( + images=test_image_tensor, + transforms=test_transform, + output_shape=resize_shape) + left_err = gradient_checker.compute_gradient_error( + test_image_tensor, + test_image_shape, + output, + output_shape, + x_init_value=test_image) + self.assertLess(left_err, 1e-10) + def test_grad(self): self._test_grad([16, 16]) self._test_grad([4, 12, 12]) self._test_grad([3, 4, 12, 12]) + self._test_grad_different_shape([16, 16], [8, 8]) + self._test_grad_different_shape([4, 12, 3], [8, 24, 3]) + self._test_grad_different_shape([3, 4, 12, 3], [3, 8, 24, 3]) class BipartiteMatchTest(test_util.TensorFlowTestCase): diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index 86b0ffe9a0f2236d5ac7d5f846e7b5d2615c9b09..e7a09041adb33981df0a8c8238bc5b9358f14180 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -23,6 +23,7 @@ from tensorflow.python.framework import common_shapes from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops @@ -40,6 +41,9 @@ ops.RegisterShape("ImageConnectedComponents")(common_shapes.call_cpp_shape_fn) ops.RegisterShape("ImageProjectiveTransform")(common_shapes.call_cpp_shape_fn) +# TODO(ringwalt): Support a "reshape" (name used by SciPy) or "expand" (name +# used by PIL, maybe more readable) mode, which determines the correct +# output_shape and translation for the transform. def rotate(images, angles, interpolation="NEAREST", name=None): """Rotate image(s) counterclockwise by the passed angle(s) in radians. @@ -213,7 +217,11 @@ def translations_to_projective_transforms(translations, name=None): axis=1) -def transform(images, transforms, interpolation="NEAREST", name=None): +def transform(images, + transforms, + interpolation="NEAREST", + output_shape=None, + name=None): """Applies the given transform(s) to the image(s). Args: @@ -230,6 +238,10 @@ def transform(images, transforms, interpolation="NEAREST", name=None): the transform mapping input points to output points. Note that gradients are not backpropagated into transformation parameters. interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR". + output_shape: Output dimesion after the transform, [height, width]. + If None, output is the same size as input image. + + name: The name of the op. Returns: Image(s) with the same type and shape as `images`, with the given @@ -238,6 +250,7 @@ def transform(images, transforms, interpolation="NEAREST", name=None): Raises: TypeError: If `image` is an invalid type. + ValueError: If output shape is not 1-D int32 Tensor. """ with ops.name_scope(name, "transform"): image_or_images = ops.convert_to_tensor(images, name="images") @@ -256,6 +269,17 @@ def transform(images, transforms, interpolation="NEAREST", name=None): else: raise TypeError("Images should have rank between 2 and 4.") + if output_shape is None: + output_shape = tensor_util.constant_value( + array_ops.shape(images)[1:3]) or array_ops.shape(images)[1:3] + + output_shape = ops.convert_to_tensor( + output_shape, dtypes.int32, name="output_shape") + + if not output_shape.get_shape().is_compatible_with([2]): + raise ValueError("output_shape must be a 1-D Tensor of 2 elements: " + "new_height, new_width") + if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif transform_or_transforms.get_shape().ndims is None: @@ -265,8 +289,12 @@ def transform(images, transforms, interpolation="NEAREST", name=None): transforms = transform_or_transforms else: raise TypeError("Transforms should have rank 1 or 2.") + output = gen_image_ops.image_projective_transform( - images, transforms, interpolation=interpolation.upper()) + images, + output_shape=output_shape, + transforms=transforms, + interpolation=interpolation.upper()) if len(image_or_images.get_shape()) == 2: return output[0, :, :, 0] elif len(image_or_images.get_shape()) == 3: @@ -376,14 +404,6 @@ def _image_projective_transform_grad(op, grad): if image_or_images.dtype.base_dtype not in _IMAGE_DTYPES: raise TypeError("Invalid dtype %s." % image_or_images.dtype) - if len(image_or_images.get_shape()) == 2: - images = image_or_images[None, :, :, None] - elif len(image_or_images.get_shape()) == 3: - images = image_or_images[None, :, :, :] - elif len(image_or_images.get_shape()) == 4: - images = image_or_images - else: - raise TypeError("Images should have rank between 2 and 4") if len(transform_or_transforms.get_shape()) == 1: transforms = transform_or_transforms[None] elif len(transform_or_transforms.get_shape()) == 2: @@ -396,13 +416,11 @@ def _image_projective_transform_grad(op, grad): inverse = linalg_ops.matrix_inverse(transforms) transforms = matrices_to_flat_transforms(inverse) output = gen_image_ops.image_projective_transform( - grad, transforms, interpolation=interpolation) - if len(image_or_images.get_shape()) == 2: - return [output[0, :, :, 0], None] - elif len(image_or_images.get_shape()) == 3: - return [output[0, :, :, :], None] - else: - return [output, None] + images=grad, + transforms=transforms, + output_shape=array_ops.shape(image_or_images)[1:3], + interpolation=interpolation) + return [output, None, None] def bipartite_match(distance_mat, diff --git a/tensorflow/contrib/image/python/ops/sparse_image_warp.py b/tensorflow/contrib/image/python/ops/sparse_image_warp.py index 54a215d6db6ded56a1a4a018a7e176f35fe6397e..1ea8f705b7e6f522281de6384de0d42efab6a406 100644 --- a/tensorflow/contrib/image/python/ops/sparse_image_warp.py +++ b/tensorflow/contrib/image/python/ops/sparse_image_warp.py @@ -112,10 +112,10 @@ def sparse_image_warp(image, Apply a non-linear warp to the image, where the warp is specified by the source and destination locations of a (potentially small) number of control points. First, we use a polyharmonic spline - (@{tf.contrib.image.interpolate_spline}) to interpolate the displacements + (`tf.contrib.image.interpolate_spline`) to interpolate the displacements between the corresponding control points to a dense flow field. Then, we warp the image using this dense flow field - (@{tf.contrib.image.dense_image_warp}). + (`tf.contrib.image.dense_image_warp`). Let t index our control points. For regularization_weight=0, we have: warped_image[b, dest_control_point_locations[b, t, 0], @@ -126,7 +126,7 @@ def sparse_image_warp(image, For regularization_weight > 0, this condition is met approximately, since regularized interpolation trades off smoothness of the interpolant vs. reconstruction of the interpolant at the control points. - See @{tf.contrib.image.interpolate_spline} for further documentation of the + See `tf.contrib.image.interpolate_spline` for further documentation of the interpolation_order and regularization_weight arguments. diff --git a/tensorflow/contrib/integrate/python/ops/odes.py b/tensorflow/contrib/integrate/python/ops/odes.py index 61f78febfc07bb4e677259366a81c16b2b585244..7b7ac4f347e30d20eb2f4889e0cae5669c975e4f 100644 --- a/tensorflow/contrib/integrate/python/ops/odes.py +++ b/tensorflow/contrib/integrate/python/ops/odes.py @@ -73,7 +73,7 @@ def _scaled_dot_product(scale, xs, ys, name=None): # _possibly_nonzero lets us avoid wasted computation. return math_ops.add_n( [(scale * x) * y for x, y in zip(xs, ys) - if _possibly_nonzero(x) or _possibly_nonzero(y)], + if _possibly_nonzero(x) and _possibly_nonzero(y)], name=scope) @@ -122,7 +122,7 @@ def _runge_kutta_step(func, yi = y0 + _scaled_dot_product(dt_cast, beta_i, k) k.append(func(yi, ti)) - if not (tableau.c_sol[-1] == 0 and tableau.c_sol == tableau.beta[-1]): + if not (tableau.c_sol[-1] == 0 and tableau.c_sol[:-1] == tableau.beta[-1]): # This property (true for Dormand-Prince) lets us save a few FLOPs. yi = y0 + _scaled_dot_product(dt_cast, tableau.c_sol, k) diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc index 2638b25ec424b5b4ef556ff769e94e64da32fec2..d0ea961473c7d6a07b152d1450b0ca2fdf1dc11f 100644 --- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc +++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/dataset.h" -#include "src-cpp/rdkafkacpp.h" +#include "rdkafkacpp.h" namespace tensorflow { @@ -52,12 +52,12 @@ class KafkaDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, std::vector topics, const string& servers, const string& group, const bool eof, const int64 timeout) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), topics_(std::move(topics)), servers_(servers), group_(group), @@ -84,7 +84,8 @@ class KafkaDatasetOp : public DatasetOpKernel { string DebugString() const override { return "KafkaDatasetOp::Dataset"; } protected: - Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* topics = nullptr; TF_RETURN_IF_ERROR(b->AddVector(topics_, &topics)); diff --git a/tensorflow/contrib/keras/__init__.py b/tensorflow/contrib/keras/__init__.py index a162f0cb584038b8df7d1ee6fe8237160ad8f695..cecf1ddcdb1c6e1b6a6f895b83a6c4f2a2aae1f7 100644 --- a/tensorflow/contrib/keras/__init__.py +++ b/tensorflow/contrib/keras/__init__.py @@ -15,7 +15,7 @@ # ============================================================================== """Implementation of the Keras API meant to be a high-level API for TensorFlow. -This module an alias for @{tf.keras}, for backwards compatibility. +This module an alias for `tf.keras`, for backwards compatibility. Detailed documentation and user guides are also available at [keras.io](https://keras.io). diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py index 1f9e82b41bf09b235e93fa512a50ea4c3047c01b..cb649a37510c301cb3df997f844617e9a4e6c7be 100644 --- a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py +++ b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py @@ -18,10 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.keras.preprocessing.image import apply_transform from tensorflow.python.keras.preprocessing.image import array_to_img from tensorflow.python.keras.preprocessing.image import DirectoryIterator -from tensorflow.python.keras.preprocessing.image import flip_axis from tensorflow.python.keras.preprocessing.image import ImageDataGenerator from tensorflow.python.keras.preprocessing.image import img_to_array from tensorflow.python.keras.preprocessing.image import Iterator diff --git a/tensorflow/contrib/kernel_methods/README.md b/tensorflow/contrib/kernel_methods/README.md index 44ed9670a09ece8fb11e79a3e58725e2a54e513b..1bce3277ff46ac91a8de118db17041a0e424ebc0 100644 --- a/tensorflow/contrib/kernel_methods/README.md +++ b/tensorflow/contrib/kernel_methods/README.md @@ -21,13 +21,15 @@ Currently, there is a [RandomFourierFeatureMapper](https://www.tensorflow.org/co output. More mappers are on the way. ## Kernel-based Estimators -These are estimators inheriting from the @{tf.contrib.learn.Estimator} class and -use kernel mappers internally to discover non-linearities in the data. These -canned estimators map their input features using kernel mapper Ops and then -apply linear models to the mapped features. Combining kernel mappers with linear -models and different loss functions leads to a variety of models: linear and -non-linear SVMs, linear regression (with and without kernels) and (multinomial) -logistic regression (with and without kernels). + +These estimators inherit from the +[`tf.contrib.learn.Estimator`](https://www.tensorflow.org/code/tensorflow/contrib/learn/python/learn/estimators/estimator.py) +class and use kernel mappers internally to discover non-linearities in the +data. These canned estimators map their input features using kernel mapper +Ops and then apply linear models to the mapped features. Combining kernel +mappers with linear models and different loss functions leads to a variety of +models: linear and non-linear SVMs, linear regression (with and without +kernels) and (multinomial) logistic regression (with and without kernels). Currently there is a [KernelLinearClassifier](https://www.tensorflow.org/code/tensorflow/contrib/kernel_methods/python/kernel_estimators.py) implemented but more pre-packaged estimators are on the way. diff --git a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc index 3212279c4c50efb92acc712b82cb3e1a22c76870..95c7001371a9b43f2e6c0c66245cc4f1fafc486d 100644 --- a/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc +++ b/tensorflow/contrib/kinesis/kernels/kinesis_dataset_ops.cc @@ -164,11 +164,11 @@ class KinesisDatasetOp : public DatasetOpKernel { } private: - class Dataset : public GraphDatasetBase { + class Dataset : public DatasetBase { public: Dataset(OpKernelContext* ctx, const string& stream, const string& shard, const bool read_indefinitely, const int64 interval) - : GraphDatasetBase(ctx), + : DatasetBase(DatasetContext(ctx)), stream_(stream), shard_(shard), read_indefinitely_(read_indefinitely), @@ -194,7 +194,8 @@ class KinesisDatasetOp : public DatasetOpKernel { string DebugString() const override { return "KinesisDatasetOp::Dataset"; } protected: - Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, Node** output) const override { Node* stream = nullptr; TF_RETURN_IF_ERROR(b->AddScalar(stream_, &stream)); diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py index bc3359693562deb1229a78a2db5c256c76f7fd8d..a7b41b714ffaa062e2eba8caf9b4fa033c7633cd 100644 --- a/tensorflow/contrib/layers/__init__.py +++ b/tensorflow/contrib/layers/__init__.py @@ -121,6 +121,7 @@ from tensorflow.contrib.layers.python.layers import * from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = ['bias_add', + 'conv1d', 'conv2d', 'conv3d', 'elu', diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py index 51610f21b24f1d40f26630cc1e69ca723d130639..1192198ec26c9db749a9bd1ee07f52395fd16a0f 100644 --- a/tensorflow/contrib/layers/python/layers/initializers.py +++ b/tensorflow/contrib/layers/python/layers/initializers.py @@ -47,7 +47,7 @@ def xavier_initializer(uniform=True, seed=None, dtype=dtypes.float32): Args: uniform: Whether to use uniform or normal distributed random initialization. seed: A Python integer. Used to create random seeds. See - @{tf.set_random_seed} for behavior. + `tf.set_random_seed` for behavior. dtype: The data type. Only floating point types are supported. Returns: @@ -98,7 +98,7 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False, mode: String. 'FAN_IN', 'FAN_OUT', 'FAN_AVG'. uniform: Whether to use uniform or normal distributed random initialization. seed: A Python integer. Used to create random seeds. See - @{tf.set_random_seed} for behavior. + `tf.set_random_seed` for behavior. dtype: The data type. Only floating point types are supported. Returns: diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index dd602cf3a9b7826a19408a78ef543bb0c4fbf84e..04668f112d85b946f313f85e60ee607fe761f63c 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -55,9 +55,9 @@ from tensorflow.python.training import moving_averages # TODO(b/28426988): Replace legacy_* fns migrated from slim. # TODO(b/28426988): Remove legacy_* when all uses have migrated to new API. __all__ = [ - 'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d', - 'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution', - 'convolution1d', 'convolution2d', 'convolution2d_in_plane', + 'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv1d', 'conv2d', + 'conv3d', 'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', + 'convolution', 'convolution1d', 'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose', 'convolution3d', 'convolution3d_transpose', 'dense_to_sparse', 'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn', 'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d', @@ -1584,7 +1584,7 @@ def dropout(inputs, outputs_collections: Collection to add the outputs. scope: Optional scope for name_scope. seed: A Python integer. Used to create random seeds. See - @{tf.set_random_seed} for behavior. + `tf.set_random_seed` for behavior. Returns: A tensor representing the output of the operation. @@ -2660,7 +2660,7 @@ def separable_convolution2d( inputs, num_outputs, kernel_size, - depth_multiplier, + depth_multiplier=1, stride=1, padding='SAME', data_format=DATA_FORMAT_NHWC, @@ -3320,6 +3320,7 @@ relu6 = functools.partial(fully_connected, activation_fn=nn.relu6) linear = functools.partial(fully_connected, activation_fn=None) # Simple alias. +conv1d = convolution1d conv2d = convolution2d conv3d = convolution3d conv2d_transpose = convolution2d_transpose diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index c5c7269b1f15849956e90654e3bcf8ab0eebc393..51c7abb105a29ff0dfab49d77bc62d5b51517179 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -1189,7 +1189,7 @@ class ConvolutionInPlaneTest(test.TestCase): result = sess.run(horz_gradients) expected = np.zeros((1, 10, 9, 1)) - self.assertAllEqual(result, expected) + self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5) def testHorzConvWithBlankImageAndPlaceholder(self): image = array_ops.placeholder(dtypes.float32, shape=(None, None, None, 1)) @@ -1209,7 +1209,7 @@ class ConvolutionInPlaneTest(test.TestCase): }) expected = np.zeros((1, 10, 9, 1)) - self.assertAllEqual(result, expected) + self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5) def testHorzConvWithRandomImageMultiBatch(self): np.random.seed(1) diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index b56a88659bbd4467600788fc8e3e9dbf38ce8244..d3aa3fa92c3ca8b67e81c4600c4ccce8a54d5792 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -79,16 +79,7 @@ py_library( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python:weights_broadcast_ops", - "//tensorflow/python/estimator", "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/estimator:export_export", - "//tensorflow/python/estimator:export_output", - "//tensorflow/python/estimator:inputs", - "//tensorflow/python/estimator:inputs_queues", - "//tensorflow/python/estimator:model_fn", - "//tensorflow/python/estimator:numpy_io", - "//tensorflow/python/estimator:pandas_io", - "//tensorflow/python/estimator:run_config", "//tensorflow/python/feature_column", "//tensorflow/python/feature_column:feature_column_py", "//tensorflow/python/ops/losses", @@ -171,7 +162,7 @@ tf_py_test( "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python:variables", - "//tensorflow/python/estimator", + "//tensorflow/python/estimator:estimator_py", ], tags = ["no_windows"], # TODO: needs investigation on Windows ) @@ -220,7 +211,7 @@ py_test( "//tensorflow/contrib/training:training_py", "//tensorflow/python:client_testlib", "//tensorflow/python:platform", - "//tensorflow/python/estimator:run_config", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -245,7 +236,7 @@ py_test( "//tensorflow/python:summary", "//tensorflow/python:training", "//tensorflow/python:variables", - "//tensorflow/python/estimator", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -259,7 +250,7 @@ py_test( "//tensorflow/core:protos_all_py", "//tensorflow/python:client_testlib", "//tensorflow/python:training", - "//tensorflow/python/estimator:run_config", + "//tensorflow/python/estimator:estimator_py", ], ) @@ -621,7 +612,7 @@ py_test( "//tensorflow/python:control_flow_ops", "//tensorflow/python:session", "//tensorflow/python:training", - "//tensorflow/python/estimator:export_output", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/saved_model:signature_constants", "@six_archive//:six", ], diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 7a026a15e4aeea0dde4ed9f7de053a757a0abb58..c1de42782efb3497660affb3ef7162457977c150 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -72,6 +72,7 @@ from tensorflow.python.saved_model import builder as saved_model_builder from tensorflow.python.saved_model import tag_constants from tensorflow.python.summary import summary as core_summary from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import device_setter from tensorflow.python.training import monitored_session from tensorflow.python.training import saver @@ -891,7 +892,7 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, # Check that model has been trained (if nothing has been set explicitly). if not checkpoint_path: - latest_path = saver.latest_checkpoint(self._model_dir) + latest_path = checkpoint_management.latest_checkpoint(self._model_dir) if not latest_path: raise NotFittedError( "Couldn't find trained model at %s." % self._model_dir) @@ -956,7 +957,7 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, as_iterable=True, iterate_batches=False): # Check that model has been trained. - checkpoint_path = saver.latest_checkpoint(self._model_dir) + checkpoint_path = checkpoint_management.latest_checkpoint(self._model_dir) if not checkpoint_path: raise NotFittedError( "Couldn't find trained model at %s." % self._model_dir) @@ -1364,7 +1365,7 @@ class Estimator(BaseEstimator): if not checkpoint_path: # Locate the latest checkpoint - checkpoint_path = saver.latest_checkpoint(self._model_dir) + checkpoint_path = checkpoint_management.latest_checkpoint(self._model_dir) if not checkpoint_path: raise NotFittedError( "Couldn't find trained model at %s." % self._model_dir) diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py index 66ebcfd1d81904b9afe5be6bd1a648fe325e1e0b..21f7dcc5e427bf00ffbc71150475d94f5336f8aa 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py +++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py @@ -15,9 +15,9 @@ """Implementation of k-means clustering on top of `Estimator` API (deprecated). This module is deprecated. Please use -@{tf.contrib.factorization.KMeansClustering} instead of -@{tf.contrib.learn.KMeansClustering}. It has a similar interface, but uses the -@{tf.estimator.Estimator} API instead of @{tf.contrib.learn.Estimator}. +`tf.contrib.factorization.KMeansClustering` instead of +`tf.contrib.learn.KMeansClustering`. It has a similar interface, but uses the +`tf.estimator.Estimator` API instead of `tf.contrib.learn.Estimator`. """ from __future__ import absolute_import diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py index 7cb87619d960a03f342c7441730aaf2c4f15eb38..08f23aa2231424887f3c935dbb8368a2aa46cc63 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py @@ -221,7 +221,7 @@ class ClusterConfig(object): class RunConfig(ClusterConfig, core_run_config.RunConfig): """This class specifies the configurations for an `Estimator` run. - This class is a deprecated implementation of @{tf.estimator.RunConfig} + This class is a deprecated implementation of `tf.estimator.RunConfig` interface. """ _USE_DEFAULT = 0 @@ -302,6 +302,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): # so instead of breaking compatibility with that assumption, we # just manually initialize this field: self._train_distribute = None + self._eval_distribute = None self._device_fn = None gpu_options = config_pb2.GPUOptions( diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index f8a3709ee57a32734afa7ac8133271c75d152b2c..4e64efdd959eef0951c9ab782996fc2bd5919cc5 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -41,7 +41,7 @@ from tensorflow.python.estimator import estimator as core_estimator from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import basic_session_run_hooks -from tensorflow.python.training import saver +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import server_lib from tensorflow.python.util import compat from tensorflow.python.util import function_utils @@ -95,7 +95,7 @@ class _EvalAndExportListener(basic_session_run_hooks.CheckpointSaverListener): # Load and cache the path of the most recent checkpoint to avoid duplicate # searches on GCS. logging.info("Checking for checkpoint in %s", self._model_dir) - latest_path = saver.latest_checkpoint(self._model_dir) + latest_path = checkpoint_management.latest_checkpoint(self._model_dir) if not latest_path: logging.warning("Skipping evaluation and export since model has not been " @@ -162,16 +162,16 @@ class Experiment(object): Args: estimator: Object implementing Estimator interface, which could be a - combination of @{tf.contrib.learn.Trainable} and - @{tf.contrib.learn.Evaluable} (deprecated), or - @{tf.estimator.Estimator}. + combination of `tf.contrib.learn.Trainable` and + `tf.contrib.learn.Evaluable` (deprecated), or + `tf.estimator.Estimator`. train_input_fn: function, returns features and labels for training. eval_input_fn: function, returns features and labels for evaluation. If `eval_steps` is `None`, this should be configured only to produce for a finite number of batches (generally, 1 epoch over the evaluation data). eval_metrics: `dict` of string, metric function. If `None`, default set is used. This should be `None` if the `estimator` is - @{tf.estimator.Estimator}. If metrics are provided they will be + `tf.estimator.Estimator`. If metrics are provided they will be *appended* to the default set. train_steps: Perform this many steps of training. `None`, the default, means train forever. @@ -516,7 +516,8 @@ class Experiment(object): start = time.time() error_msg = None - latest_path = saver.latest_checkpoint(self._estimator.model_dir) + latest_path = checkpoint_management.latest_checkpoint( + self._estimator.model_dir) if not latest_path: error_msg = ("Estimator is not fitted yet. " "Will start an evaluation when a checkpoint is ready.") @@ -778,7 +779,8 @@ class Experiment(object): saving_listeners=self._saving_listeners) logging.info("Evaluating model now.") - latest_checkpoint = saver.latest_checkpoint(self._estimator.model_dir) + latest_checkpoint = checkpoint_management.latest_checkpoint( + self._estimator.model_dir) eval_result = self._call_evaluate( input_fn=self._eval_input_fn, steps=self._eval_steps, diff --git a/tensorflow/contrib/learn/python/learn/graph_actions_test.py b/tensorflow/contrib/learn/python/learn/graph_actions_test.py index 0d039d593b7850ead34484f88426255dc659b7fc..df156da3f467538ed1c6b640d651fdfd33ce243d 100644 --- a/tensorflow/contrib/learn/python/learn/graph_actions_test.py +++ b/tensorflow/contrib/learn/python/learn/graph_actions_test.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import state_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.summary import summary +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as saver_lib @@ -124,7 +125,7 @@ class GraphActionsTest(test.TestCase): # TODO(ptucker): Test number and contents of checkpoint files. def _assert_ckpt(self, output_dir, expected=True): - ckpt_state = saver_lib.get_checkpoint_state(output_dir) + ckpt_state = checkpoint_management.get_checkpoint_state(output_dir) if expected: pattern = '%s/model.ckpt-.*' % output_dir primary_ckpt_path = ckpt_state.model_checkpoint_path @@ -434,7 +435,7 @@ class GraphActionsTrainTest(test.TestCase): # TODO(ptucker): Test number and contents of checkpoint files. def _assert_ckpt(self, output_dir, expected=True): - ckpt_state = saver_lib.get_checkpoint_state(output_dir) + ckpt_state = checkpoint_management.get_checkpoint_state(output_dir) if expected: pattern = '%s/model.ckpt-.*' % output_dir primary_ckpt_path = ckpt_state.model_checkpoint_path diff --git a/tensorflow/contrib/learn/python/learn/monitors.py b/tensorflow/contrib/learn/python/learn/monitors.py index 77f7c73d5412d40b338eaff4cf04d99fd0892723..3d691d434044aab1e3e86457cee6aadb5bf798c7 100644 --- a/tensorflow/contrib/learn/python/learn/monitors.py +++ b/tensorflow/contrib/learn/python/learn/monitors.py @@ -51,7 +51,7 @@ from tensorflow.python.estimator import estimator as core_estimator from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import summary as core_summary -from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import session_run_hook from tensorflow.python.training import training_util from tensorflow.python.util import deprecation @@ -735,7 +735,8 @@ class ValidationMonitor(EveryN): return False self._last_checkpoint_check_time = current_time # Check that we are not running evaluation on the same checkpoint. - latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir) + latest_path = checkpoint_management.latest_checkpoint( + self._estimator.model_dir) if latest_path is None: logging.debug("Skipping evaluation since model has not been saved yet " "at step %d.", step) @@ -1059,7 +1060,8 @@ class ExportMonitor(EveryN): def end(self, session=None): super(ExportMonitor, self).end(session=session) - latest_path = saver_lib.latest_checkpoint(self._estimator.model_dir) + latest_path = checkpoint_management.latest_checkpoint( + self._estimator.model_dir) if latest_path is None: logging.info("Skipping export at the end since model has not been saved " "yet.") diff --git a/tensorflow/contrib/learn/python/learn/monitors_test.py b/tensorflow/contrib/learn/python/learn/monitors_test.py index 5c34d0ddb01f3bcdc407e6926e7c5b73be1863b4..ff1da32c218b4e105b5503426ac01410665f9c7e 100644 --- a/tensorflow/contrib/learn/python/learn/monitors_test.py +++ b/tensorflow/contrib/learn/python/learn/monitors_test.py @@ -39,9 +39,9 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import test from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import summary +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import gradient_descent from tensorflow.python.training import monitored_session -from tensorflow.python.training import saver from tensorflow.python.training import training_util @@ -317,7 +317,7 @@ class MonitorsTest(test.TestCase): self._run_monitor(monitor) @test.mock.patch.object(estimators, 'Estimator', autospec=True) - @test.mock.patch.object(saver, 'latest_checkpoint') + @test.mock.patch.object(checkpoint_management, 'latest_checkpoint') def test_validation_monitor_no_ckpt(self, mock_latest_checkpoint, mock_estimator_class): estimator = mock_estimator_class() @@ -336,7 +336,7 @@ class MonitorsTest(test.TestCase): mock_latest_checkpoint.assert_called_with(model_dir) @test.mock.patch.object(estimators, 'Estimator', autospec=True) - @test.mock.patch.object(saver, 'latest_checkpoint') + @test.mock.patch.object(checkpoint_management, 'latest_checkpoint') def test_validation_monitor_no_early_stopping_rounds(self, mock_latest_checkpoint, mock_estimator_class): @@ -356,7 +356,7 @@ class MonitorsTest(test.TestCase): self._assert_validation_monitor(monitor) @test.mock.patch.object(estimators, 'Estimator', autospec=True) - @test.mock.patch.object(saver, 'latest_checkpoint') + @test.mock.patch.object(checkpoint_management, 'latest_checkpoint') def test_validation_monitor_invalid_metric(self, mock_latest_checkpoint, mock_estimator_class): estimator = mock_estimator_class() @@ -375,7 +375,7 @@ class MonitorsTest(test.TestCase): self._run_monitor(monitor, num_epochs=1, num_steps_per_epoch=1) @test.mock.patch.object(estimators, 'Estimator', autospec=True) - @test.mock.patch.object(saver, 'latest_checkpoint') + @test.mock.patch.object(checkpoint_management, 'latest_checkpoint') def test_validation_monitor(self, mock_latest_checkpoint, mock_estimator_class): estimator = mock_estimator_class() @@ -464,7 +464,7 @@ class MonitorsTest(test.TestCase): monitor.epoch_end(epoch=0) monitor.end() - @test.mock.patch.object(saver, 'latest_checkpoint') + @test.mock.patch.object(checkpoint_management, 'latest_checkpoint') def test_validation_monitor_with_core_estimator(self, mock_latest_checkpoint): estimator = test.mock.Mock(spec=core_estimator.Estimator) model_dir = 'model/dir' @@ -495,7 +495,7 @@ class MonitorsTest(test.TestCase): expected_best_metrics={'loss': 42.0, 'auc': 0.5}) monitor.post_step(step=step, session=None) - @test.mock.patch.object(saver, 'latest_checkpoint') + @test.mock.patch.object(checkpoint_management, 'latest_checkpoint') def test_validation_monitor_fail_with_core_estimator_and_metrics( self, mock_latest_checkpoint): estimator = test.mock.Mock(spec=core_estimator.Estimator) diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py index 3eacac7a3d3dcff4d39025fdee88e16e385b1b84..0144b93814a174cfb8c3162f407a595ac637f4f5 100644 --- a/tensorflow/contrib/learn/python/learn/utils/export.py +++ b/tensorflow/contrib/learn/python/learn/utils/export.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as tf_saver from tensorflow.python.training import training_util @@ -298,7 +299,8 @@ def _export_estimator(estimator, # If checkpoint_path is specified, use the specified checkpoint path. checkpoint_path = (checkpoint_path or - tf_saver.latest_checkpoint(estimator._model_dir)) + checkpoint_management.latest_checkpoint( + estimator._model_dir)) with ops.Graph().as_default() as g: training_util.create_global_step(g) diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py index f8106d1e4a7e79f1cd651c40995be480721a8129..4f22054af3077fa5322b52f56e815fe76104f602 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py @@ -55,7 +55,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import signature_def_utils from tensorflow.python.summary import summary_iterator -from tensorflow.python.training import saver +from tensorflow.python.training import checkpoint_management from tensorflow.python.util import compat from tensorflow.python.util.deprecation import deprecated @@ -415,7 +415,7 @@ def make_export_strategy(serving_input_fn, `InputFnOps`. default_output_alternative_key: the name of the head to serve when an incoming serving request does not explicitly request a specific head. - Must be `None` if the estimator inherits from @{tf.estimator.Estimator} + Must be `None` if the estimator inherits from `tf.estimator.Estimator` or for single-headed models. assets_extra: A dict specifying how to populate the assets.extra directory within the exported SavedModel. Each key should give the destination @@ -453,7 +453,7 @@ def make_export_strategy(serving_input_fn, The string path to the exported directory. Raises: - ValueError: If `estimator` is a @{tf.estimator.Estimator} instance + ValueError: If `estimator` is a `tf.estimator.Estimator` instance and `default_output_alternative_key` was specified. """ if isinstance(estimator, core_estimator.Estimator): @@ -504,7 +504,7 @@ def make_parsing_export_strategy(feature_columns, that must be provided at serving time (excluding labels!). default_output_alternative_key: the name of the head to serve when an incoming serving request does not explicitly request a specific head. - Must be `None` if the estimator inherits from @{tf.estimator.Estimator} + Must be `None` if the estimator inherits from `tf.estimator.Estimator` or for single-headed models. assets_extra: A dict specifying how to populate the assets.extra directory within the exported SavedModel. Each key should give the destination @@ -714,7 +714,8 @@ def make_best_model_export_strategy( # as soon as contrib is cleaned up and we can thus be sure that # estimator is a tf.estimator.Estimator and not a # tf.contrib.learn.Estimator - checkpoint_path = saver.latest_checkpoint(estimator.model_dir) + checkpoint_path = checkpoint_management.latest_checkpoint( + estimator.model_dir) export_checkpoint_path, export_eval_result = best_model_selector.update( checkpoint_path, eval_result) @@ -766,7 +767,7 @@ def extend_export_strategy(base_export_strategy, The string path to the SavedModel indicated by post_export_fn. Raises: - ValueError: If `estimator` is a @{tf.estimator.Estimator} instance + ValueError: If `estimator` is a `tf.estimator.Estimator` instance and `default_output_alternative_key` was specified or if post_export_fn does not return a valid directory. RuntimeError: If unable to create temporary or final export directory. diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD index fe0ba19fcbe90edbeb1445e1fea77c36cf3ba170..7534b50a4ae0076fb27fb9cd0d1dd58b29192876 100644 --- a/tensorflow/contrib/linear_optimizer/BUILD +++ b/tensorflow/contrib/linear_optimizer/BUILD @@ -41,7 +41,10 @@ py_test( size = "medium", srcs = ["python/kernel_tests/sdca_ops_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows_gpu"], + tags = [ + "no_gpu", + "no_pip_gpu", + ], deps = [ ":sdca_ops_py", ":sparse_feature_column_py", diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD index 7d7dd6b7088f457b1a14a3ff30b7eef98c00d18a..1e6f1e7da212c3aeb1563dc2f4b6dff2cb550736 100644 --- a/tensorflow/contrib/lite/BUILD +++ b/tensorflow/contrib/lite/BUILD @@ -125,10 +125,22 @@ cc_library( "graph_info.cc", "interpreter.cc", "model.cc", - "nnapi_delegate.cc", "op_resolver.cc", "optional_debug_tools.cc", - ], + ] + select({ + "//tensorflow:android": [ + "nnapi_delegate.cc", + "mmap_allocation.cc", + ], + "//tensorflow:windows": [ + "nnapi_delegate_disabled.cc", + "mmap_allocation_disabled.cc", + ], + "//conditions:default": [ + "nnapi_delegate_disabled.cc", + "mmap_allocation.cc", + ], + }), hdrs = [ "allocation.h", "context.h", diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc index ef6c14f08532a8d25ab9be6000bc0f24559074d2..89462618148a2afbcf2ef6b1dd2985bcd0178734 100644 --- a/tensorflow/contrib/lite/allocation.cc +++ b/tensorflow/contrib/lite/allocation.cc @@ -13,61 +13,22 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#ifndef TFLITE_MCU -#include -#endif +#include "tensorflow/contrib/lite/allocation.h" + #include #include -#include #include #include #include #include #include -#include "tensorflow/contrib/lite/allocation.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/error_reporter.h" -#ifndef TFLITE_MCU -#include "tensorflow/contrib/lite/nnapi_delegate.h" -#endif namespace tflite { #ifndef TFLITE_MCU -MMAPAllocation::MMAPAllocation(const char* filename, - ErrorReporter* error_reporter) - : Allocation(error_reporter), mmapped_buffer_(MAP_FAILED) { - mmap_fd_ = open(filename, O_RDONLY); - if (mmap_fd_ == -1) { - error_reporter_->Report("Could not open '%s'.", filename); - return; - } - struct stat sb; - fstat(mmap_fd_, &sb); - buffer_size_bytes_ = sb.st_size; - mmapped_buffer_ = - mmap(nullptr, buffer_size_bytes_, PROT_READ, MAP_SHARED, mmap_fd_, 0); - if (mmapped_buffer_ == MAP_FAILED) { - error_reporter_->Report("Mmap of '%s' failed.", filename); - return; - } -} - -MMAPAllocation::~MMAPAllocation() { - if (valid()) { - munmap(const_cast(mmapped_buffer_), buffer_size_bytes_); - } - if (mmap_fd_ != -1) close(mmap_fd_); -} - -const void* MMAPAllocation::base() const { return mmapped_buffer_; } - -size_t MMAPAllocation::bytes() const { return buffer_size_bytes_; } - -bool MMAPAllocation::valid() const { return mmapped_buffer_ != MAP_FAILED; } - FileCopyAllocation::FileCopyAllocation(const char* filename, ErrorReporter* error_reporter) : Allocation(error_reporter) { @@ -111,6 +72,7 @@ const void* FileCopyAllocation::base() const { return copied_buffer_.get(); } size_t FileCopyAllocation::bytes() const { return buffer_size_bytes_; } bool FileCopyAllocation::valid() const { return copied_buffer_ != nullptr; } +#endif MemoryAllocation::MemoryAllocation(const void* ptr, size_t num_bytes, ErrorReporter* error_reporter) @@ -118,7 +80,6 @@ MemoryAllocation::MemoryAllocation(const void* ptr, size_t num_bytes, buffer_ = ptr; buffer_size_bytes_ = num_bytes; } -#endif MemoryAllocation::~MemoryAllocation() {} diff --git a/tensorflow/contrib/lite/allocation.h b/tensorflow/contrib/lite/allocation.h index 827ea86503f910714971e2b138295b9a5809dfd5..121f3d264687933f45f3a2c5d2a53ad80d594ca9 100644 --- a/tensorflow/contrib/lite/allocation.h +++ b/tensorflow/contrib/lite/allocation.h @@ -52,6 +52,8 @@ class MMAPAllocation : public Allocation { size_t bytes() const override; bool valid() const override; + static bool IsSupported(); + protected: // Data required for mmap. int mmap_fd_ = -1; // mmap file descriptor diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index 422584c0eac6e703257bc58f138695d8f580a126..81844756bc7239fa798ff96b8b093afdf9ea9557 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -247,7 +247,9 @@ def generated_test_models(): "local_response_norm", "log_softmax", "log", + "logical_and", "logical_or", + "logical_xor", "lstm", "max_pool", "maximum", @@ -263,7 +265,7 @@ def generated_test_models(): "prelu", "pow", "reduce_max", - #"reduce_prod", # disabled due to b/111823366 + "reduce_prod", "relu", "relu1", "relu6", diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h index 0b6568fd2fec583914de1d1594f29912425d8b40..8a8eb9856886538a1483141ab5f67f54613ea2a1 100644 --- a/tensorflow/contrib/lite/builtin_ops.h +++ b/tensorflow/contrib/lite/builtin_ops.h @@ -111,6 +111,8 @@ typedef enum { kTfLiteBuiltinPack = 83, kTfLiteBuiltinLogicalOr = 84, kTfLiteBuiltinOneHot = 85, + kTfLiteBuiltinLogicalAnd = 86, + kTfLiteBuiltinLogicalNot = 87, } TfLiteBuiltinOperator; #ifdef __cplusplus diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/eager/BUILD index a28707382ebaac421a077432a6efd4ea1f6bb0fb..bb518becc582b776096fc0d2720042286b0b871e 100644 --- a/tensorflow/contrib/lite/delegates/eager/BUILD +++ b/tensorflow/contrib/lite/delegates/eager/BUILD @@ -7,6 +7,8 @@ package(default_visibility = [ licenses(["notice"]) # Apache 2.0 +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + cc_library( name = "buffer_map", srcs = ["buffer_map.cc"], @@ -21,12 +23,11 @@ cc_library( ], ) -cc_test( +tf_cc_test( name = "buffer_map_test", size = "small", srcs = ["buffer_map_test.cc"], tags = [ - "no_oss", "tflite_not_portable", ], deps = [ @@ -38,6 +39,41 @@ cc_test( ], ) +cc_library( + name = "delegate", + srcs = [ + "delegate.cc", + ], + hdrs = [ + "delegate.h", + ], + deps = [ + ":buffer_map", + ":delegate_data", + ":kernel", + ":util", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite:kernel_api", + "//tensorflow/contrib/lite:util", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "delegate_test", + size = "small", + srcs = ["delegate_test.cc"], + tags = [ + "tflite_not_portable", + ], + deps = [ + ":delegate", + ":test_util", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "delegate_data", srcs = ["delegate_data.cc"], @@ -50,12 +86,11 @@ cc_library( ], ) -cc_test( +tf_cc_test( name = "delegate_data_test", size = "small", srcs = ["delegate_data_test.cc"], tags = [ - "no_oss", "tflite_not_portable", ], deps = [ @@ -76,6 +111,7 @@ cc_library( ":util", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:kernel_api", + "//tensorflow/contrib/lite:string", "//tensorflow/contrib/lite/kernels:kernel_util", "//tensorflow/core:protos_all_cc", "//tensorflow/core/common_runtime/eager:context", @@ -85,21 +121,31 @@ cc_library( ], ) -cc_test( +tf_cc_test( name = "kernel_test", size = "small", srcs = ["kernel_test.cc"], tags = [ - "no_oss", "tflite_not_portable", ], deps = [ ":delegate_data", ":kernel", + ":test_util", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "test_util", + testonly = True, + srcs = ["test_util.cc"], + hdrs = ["test_util.h"], + deps = [ + "//tensorflow/c:c_api_internal", + "//tensorflow/contrib/lite:string", "//tensorflow/contrib/lite/kernels:test_util", - "//tensorflow/contrib/lite/testing:util", "@com_google_absl//absl/memory", - "@com_google_googletest//:gtest", "@flatbuffers", ], ) @@ -109,6 +155,7 @@ cc_library( srcs = ["util.cc"], hdrs = ["util.h"], deps = [ + ":constants", "//tensorflow/c:c_api_internal", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:kernel_api", @@ -117,18 +164,22 @@ cc_library( ], ) -cc_test( +tf_cc_test( name = "util_test", size = "small", srcs = ["util_test.cc"], tags = [ - "no_oss", "tflite_not_portable", ], deps = [ ":util", + "//tensorflow/contrib/lite:string", "//tensorflow/contrib/lite/testing:util", - "//tensorflow/core:lib", "@com_google_googletest//:gtest", ], ) + +cc_library( + name = "constants", + hdrs = ["constants.h"], +) diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc b/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc index dcb3f6c94150892f565380ff0598a7a28f9399b1..a046943e56d2b80f2670b7fc3dd57b36dc4d2425 100644 --- a/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc +++ b/tensorflow/contrib/lite/delegates/eager/buffer_map_test.cc @@ -56,8 +56,8 @@ tensorflow::Tensor MakeTensor(const std::vector& shape, return buffer_map.GetTensor(0); } -std::vector GetTensorShape(const tensorflow::Tensor& t) { - std::vector shape(t.dims()); +std::vector GetTensorShape(const tensorflow::Tensor& t) { + std::vector shape(t.dims()); for (int i = 0; i < t.dims(); ++i) { shape[i] = t.dim_size(i); } diff --git a/tensorflow/contrib/lite/delegates/eager/constants.h b/tensorflow/contrib/lite/delegates/eager/constants.h new file mode 100644 index 0000000000000000000000000000000000000000..7ed6ab7552792c68e6d90056c83c3c574c3f69f7 --- /dev/null +++ b/tensorflow/contrib/lite/delegates/eager/constants.h @@ -0,0 +1,29 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_ +#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_ + +namespace tflite { +namespace eager { + +// The prefix of Eager op custom code. +// This will be matched agains the `custom_code` field in `OperatorCode` +// Flatbuffer Table. +constexpr char kCustomCodePrefix[] = "Eager"; + +} // namespace eager +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_CONSTANTS_H_ diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.cc b/tensorflow/contrib/lite/delegates/eager/delegate.cc new file mode 100644 index 0000000000000000000000000000000000000000..7d22b454199e2c0d9b8fea05086a7c62d7cdbe81 --- /dev/null +++ b/tensorflow/contrib/lite/delegates/eager/delegate.cc @@ -0,0 +1,110 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/delegates/eager/delegate.h" + +#include + +#include "tensorflow/contrib/lite/context_util.h" +#include "tensorflow/contrib/lite/delegates/eager/buffer_map.h" +#include "tensorflow/contrib/lite/delegates/eager/kernel.h" +#include "tensorflow/contrib/lite/delegates/eager/util.h" +#include "tensorflow/contrib/lite/util.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tflite { +namespace eager { +namespace delegate { + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) { + // Get the nodes in the current execution plan. Interpreter owns this array. + TfLiteIntArray* plan; + TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan)); + + // Add all custom ops starting with "Eager" to list of supported nodes. + std::vector supported_nodes; + for (int node_index : TfLiteIntArrayView(plan)) { + TfLiteNode* node; + TfLiteRegistration* registration; + TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration( + context, node_index, &node, ®istration)); + + if (IsEagerOp(registration->custom_name)) { + supported_nodes.push_back(node_index); + } + } + + // Request TFLite to partition the graph and make kernels for each independent + // subgraph. + TfLiteIntArray* size_and_nodes = + ConvertVectorToTfLiteIntArray(supported_nodes); + context->ReplaceSubgraphsWithDelegateKernels(context, GetKernel(), + size_and_nodes, delegate); + TfLiteIntArrayFree(size_and_nodes); + return kTfLiteOk; +} + +TfLiteStatus CopyFromBufferHandle(TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, void* data, + size_t size) { + // TODO(nupurgarg): Make BufferMap unique to each interpreter in order to + // support multiple interpreters using a single delegate. + BufferMap* buffer_map = + reinterpret_cast(delegate->data_)->GetBufferMap(); + + // TODO(nupurgarg): Use TfLiteContext's ReportError instead of fprinf. + if (!buffer_map->HasTensor(buffer_handle)) { + fprintf(stderr, "Invalid tensor index %d.\n", buffer_handle); + return kTfLiteError; + } + + tensorflow::Tensor t = buffer_map->GetTensor(buffer_handle); + tensorflow::StringPiece t_data = t.tensor_data(); + + if (size != t_data.size()) { + fprintf(stderr, "Not enough space to store TensorFlow's aligned buffer.\n"); + return kTfLiteError; + } + + memcpy(data, t_data.data(), t_data.size()); + return kTfLiteOk; +} + +} // namespace delegate +} // namespace eager + +EagerDelegate::EagerDelegate() {} + +EagerDelegate::~EagerDelegate() {} + +TfLiteStatus EagerDelegate::Apply(Interpreter* interpreter) { + if (!delegate_) { + if (!eager::DelegateData::Create(&delegate_data_).ok()) { + fprintf(stderr, "Unable to initialize TensorFlow context.\n"); + return kTfLiteError; + } + + delegate_.reset(new TfLiteDelegate{ + /*data_=*/delegate_data_.get(), + /*nullptr,*/ &eager::delegate::Prepare, + /*CopyFromBufferHandle=*/&eager::delegate::CopyFromBufferHandle, + /*CopyToBufferHandle=*/nullptr, + /*FreeBufferHandle=*/nullptr}); + } + + return interpreter->ModifyGraphWithDelegate(delegate_.get(), + /*allow_dynamic_tensors=*/true); +} + +} // namespace tflite diff --git a/tensorflow/contrib/lite/delegates/eager/delegate.h b/tensorflow/contrib/lite/delegates/eager/delegate.h new file mode 100644 index 0000000000000000000000000000000000000000..0defca7c323e81bfb211ac56fd59c8656b320574 --- /dev/null +++ b/tensorflow/contrib/lite/delegates/eager/delegate.h @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_H_ +#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_H_ + +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h" +#include "tensorflow/contrib/lite/interpreter.h" + +namespace tflite { + +// WARNING: This is an experimental interface that is subject to change. +// Delegate that can be used to extract parts of a graph that are designed to be +// executed by TensorFlow's runtime via Eager. +// +// The interpreter must be constructed after the EagerDelegate and destructed +// before the EagerDelegate. This delegate can only be used with one +// interpreter. +// +// Usage: +// EagerDelegate delegate; +// ... build interpreter ... +// +// delegate.Apply(interpreter); +// ... run inference ... +// ... destroy interpreter ... +// ... destroy delegate ... +class EagerDelegate { + public: + EagerDelegate(); + ~EagerDelegate(); + + // Modifies the graph loaded in the interpreter. + TfLiteStatus Apply(Interpreter* interpreter); + + private: + std::unique_ptr delegate_data_; + std::unique_ptr delegate_; +}; + +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_DELEGATE_H_ diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_test.cc b/tensorflow/contrib/lite/delegates/eager/delegate_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..88fb34044ec5f8e5b4593638163cd4e6407bf8c8 --- /dev/null +++ b/tensorflow/contrib/lite/delegates/eager/delegate_test.cc @@ -0,0 +1,150 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/delegates/eager/delegate.h" + +#include +#include +#include "tensorflow/contrib/lite/delegates/eager/test_util.h" + +namespace tflite { +namespace eager { +namespace { + +using ::testing::ContainsRegex; +using ::testing::ElementsAre; + +// TODO(nupurgarg): Add a test with multiple interpreters for one delegate. + +class DelegateTest : public testing::EagerModelTest { + public: + DelegateTest() { + // The delegate needs to be constructed before the interpreter because the + // interpreter references data contained in the delegate. + delegate_.reset(new EagerDelegate()); + interpreter_.reset(new Interpreter(&error_reporter_)); + } + + ~DelegateTest() override { + // The delegate needs to be destructed after the interpreter because the + // interpreter references data contained in the delegate. + delete interpreter_.release(); + delete delegate_.release(); + } + + void ConfigureDelegate() { + CHECK(delegate_->Apply(interpreter_.get()) == kTfLiteOk); + } + + private: + std::unique_ptr delegate_; +}; + +TEST_F(DelegateTest, FullGraph) { + // Define the graph. + AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3}); + + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); + AddTfOp(testing::kAdd, {1, 4}, {6}); + AddTfOp(testing::kAdd, {2, 5}, {7}); + AddTfOp(testing::kMul, {6, 7}, {8}); + + // Apply the delegate. + ConfigureDelegate(); + + // Define inputs. + SetShape(0, {2, 2, 1}); + SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); + SetShape(3, {2, 2, 1}); + SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(8), ElementsAre(2, 1)); + ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f)); +} + +TEST_F(DelegateTest, MixedGraph) { + AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3}); + + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); + AddTfOp(testing::kAdd, {1, 4}, {6}); + AddTfOp(testing::kAdd, {2, 5}, {7}); + AddTfLiteMulOp({6, 7}, {8}); + + ConfigureDelegate(); + + SetShape(0, {2, 2, 1}); + SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); + SetShape(3, {2, 2, 1}); + SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(8), ElementsAre(2, 1)); + ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f)); +} + +TEST_F(DelegateTest, SplitGraph) { + AddTensors(10, {0}, {9}, kTfLiteFloat32, {3}); + + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kAdd, {1, 2}, {3}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); + + AddTfLiteMulOp({4, 5}, {6}); + + AddTfOp(testing::kUnpack, {6}, {7, 8}); + AddTfOp(testing::kAdd, {7, 8}, {9}); + + ConfigureDelegate(); + + SetShape(0, {2, 2, 2, 1}); + SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(9), ElementsAre(1)); + ASSERT_THAT(GetValues(9), ElementsAre(10.0f)); +} + +TEST_F(DelegateTest, OnlyTFLite) { + // Only TFLite single op model. + AddTensors(10, {0, 1}, {2}, kTfLiteFloat32, {3}); + AddTfLiteMulOp({0, 1}, {2}); + + ConfigureDelegate(); + + SetShape(0, {2, 2, 1}); + SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); + SetShape(1, {2, 2, 1}); + SetValues(1, {1.0f, 2.0f, 3.0f, 4.0f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(2), ElementsAre(2, 2, 1)); + ASSERT_THAT(GetValues(2), ElementsAre(1.1f, 4.4f, 9.9f, 17.6f)); +} + +} // namespace +} // namespace eager +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/eager/kernel.cc index 172798180762f87e1c080be7788db661a63208b5..1bd17a3bcae727e8908ce669472f79595a8916a0 100644 --- a/tensorflow/contrib/lite/delegates/eager/kernel.cc +++ b/tensorflow/contrib/lite/delegates/eager/kernel.cc @@ -14,13 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/delegates/eager/kernel.h" -#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h" +#include "flatbuffers/flexbuffers.h" #include "tensorflow/contrib/lite/builtin_ops.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/context_util.h" #include "tensorflow/contrib/lite/delegates/eager/delegate_data.h" #include "tensorflow/contrib/lite/delegates/eager/util.h" #include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/string.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/execute.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" diff --git a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc index 7d9dddef93346c8e20df0d3f84ece6197a605c86..b7bfbb34e49c71142e28f0bf1b2f84e0ff570734 100644 --- a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc +++ b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc @@ -16,26 +16,16 @@ limitations under the License. #include #include -#include "absl/memory/memory.h" -#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h" #include "tensorflow/contrib/lite/delegates/eager/delegate_data.h" -#include "tensorflow/contrib/lite/kernels/test_util.h" -#include "tensorflow/contrib/lite/testing/util.h" +#include "tensorflow/contrib/lite/delegates/eager/test_util.h" namespace tflite { namespace eager { namespace { -using tensorflow::protobuf::TextFormat; using ::testing::ContainsRegex; using ::testing::ElementsAre; -// We will use these are custom_names, so they need to be static. -static const char kIdentity[] = "Identity"; -static const char kUnpack[] = "Unpack"; -static const char kAdd[] = "Add"; -static const char kMul[] = "Mul"; - TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate, const std::vector& supported_nodes) { TfLiteIntArray* size_and_nodes = @@ -46,39 +36,18 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate, return kTfLiteOk; } -class KernelTest : public ::testing::Test { +class KernelTest : public testing::EagerModelTest { public: KernelTest() { CHECK(DelegateData::Create(&delegate_data_).ok()); interpreter_.reset(new Interpreter(&error_reporter_)); } - bool Invoke() { return interpreter_->Invoke() == kTfLiteOk; } - - void SetValues(int tensor_index, const std::vector& values) { - float* v = interpreter_->typed_tensor(tensor_index); - for (float f : values) { - *v++ = f; - } - } - - std::vector GetValues(int tensor_index) { - TfLiteTensor* o = interpreter_->tensor(tensor_index); - return std::vector(o->data.f, o->data.f + o->bytes / sizeof(float)); - } - - void SetShape(int tensor_index, const std::vector& values) { - ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk); - ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk); - } - - std::vector GetShape(int tensor_index) { - std::vector result; - auto* dims = interpreter_->tensor(tensor_index)->dims; - for (int i = 0; i < dims->size; ++i) { - result.push_back(dims->data[i]); - } - return result; + ~KernelTest() override { + // The data needs to be released before the interpreter because the + // interpreter references the data. + delegate_data_.reset(); + interpreter_.reset(); } template @@ -99,112 +68,20 @@ class KernelTest : public ::testing::Test { &delegate_, /*allow_dynamic_tensors=*/true) == kTfLiteOk); } - void AddOp(const char* name, const std::vector& inputs, - const std::vector& outputs) { - auto attr = [](const string& key, const string& value) { - return " attr{ key: '" + key + "' value {" + value + "}}"; - }; - - string attributes; - if (name == string(kUnpack)) { - attributes = attr("T", "type: DT_FLOAT") + attr("num", "i: 2") + - attr("axis", "i: 0"); - } else if (name == string(kIdentity)) { - attributes = attr("T", "type: DT_FLOAT"); - } else if (name == string(kAdd)) { - attributes = attr("T", "type: DT_FLOAT"); - } else if (name == string(kMul)) { - attributes = attr("T", "type: DT_FLOAT"); - } - AddTfOp(name, attributes, inputs, outputs); - } - - void AddTensors(int num_tensors, const std::vector& inputs, - const std::vector& outputs) { - interpreter_->AddTensors(num_tensors); - for (int i = 0; i < num_tensors; ++i) { - TfLiteQuantizationParams quant; - CHECK_EQ(interpreter_->SetTensorParametersReadWrite(i, kTfLiteFloat32, - /*name=*/"", - /*dims=*/{3}, quant), - kTfLiteOk); - } - - CHECK_EQ(interpreter_->SetInputs(inputs), kTfLiteOk); - CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk); - } - - const TestErrorReporter& error_reporter() const { return error_reporter_; } - - void AddTfLiteOp(const char* name, const std::vector& inputs, - const std::vector& outputs) { - CHECK_EQ(string(name), kMul); // can only add MUL - static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; - reg.builtin_code = BuiltinOperator_MUL; - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - auto* i0 = &context->tensors[node->inputs->data[0]]; - auto* o = &context->tensors[node->outputs->data[0]]; - return context->ResizeTensor(context, o, TfLiteIntArrayCopy(i0->dims)); - }; - reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { - auto* i0 = &context->tensors[node->inputs->data[0]]; - auto* i1 = &context->tensors[node->inputs->data[1]]; - auto* o = &context->tensors[node->outputs->data[0]]; - for (int i = 0; i < o->bytes / sizeof(float); ++i) { - o->data.f[i] = i0->data.f[i] * i1->data.f[i]; - } - return kTfLiteOk; - }; - - CHECK_EQ(interpreter_->AddNodeWithParameters(inputs, outputs, nullptr, 0, - nullptr, ®), - kTfLiteOk); - } - private: - void AddTfOp(const char* name, const string& nodedef_str, - const std::vector& inputs, - const std::vector& outputs) { - static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; - reg.builtin_code = BuiltinOperator_CUSTOM; - reg.custom_name = name; - - tensorflow::NodeDef nodedef; - CHECK(TextFormat::ParseFromString(nodedef_str + " op: '" + name + "'", - &nodedef)); - string serialized_nodedef; - CHECK(nodedef.SerializeToString(&serialized_nodedef)); - flexbuffers::Builder fbb; - fbb.Vector([&]() { - fbb.String(nodedef.op()); - fbb.String(serialized_nodedef); - }); - fbb.Finish(); - - flexbuffers_.push_back(fbb.GetBuffer()); - auto& buffer = flexbuffers_.back(); - CHECK_EQ(interpreter_->AddNodeWithParameters( - inputs, outputs, reinterpret_cast(buffer.data()), - buffer.size(), nullptr, ®), - kTfLiteOk); - } - - std::unique_ptr interpreter_; std::unique_ptr delegate_data_; TfLiteDelegate delegate_; - std::vector> flexbuffers_; - TestErrorReporter error_reporter_; }; TEST_F(KernelTest, FullGraph) { // Define the graph. - AddTensors(9, {0, 3}, {8}); + AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3}); - AddOp(kUnpack, {0}, {1, 2}); - AddOp(kUnpack, {3}, {4, 5}); - AddOp(kAdd, {1, 4}, {6}); - AddOp(kAdd, {2, 5}, {7}); - AddOp(kMul, {6, 7}, {8}); + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); + AddTfOp(testing::kAdd, {1, 4}, {6}); + AddTfOp(testing::kAdd, {2, 5}, {7}); + AddTfOp(testing::kMul, {6, 7}, {8}); // Apply Delegate. ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { @@ -224,8 +101,8 @@ TEST_F(KernelTest, FullGraph) { } TEST_F(KernelTest, BadTensorFlowOp) { - AddTensors(2, {0}, {1}); - AddOp("NonExistentOp", {0}, {1}); + AddTensors(2, {0}, {1}, kTfLiteFloat32, {3}); + AddTfOp(testing::kNonExistent, {0}, {1}); ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { return GenericPrepare(context, delegate, {0}); @@ -240,8 +117,8 @@ TEST_F(KernelTest, BadTensorFlowOp) { } TEST_F(KernelTest, BadNumberOfOutputs) { - AddTensors(3, {0}, {1, 2}); - AddOp(kIdentity, {0}, {1, 2}); + AddTensors(3, {0}, {1, 2}, kTfLiteFloat32, {3}); + AddTfOp(testing::kIdentity, {0}, {1, 2}); ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { return GenericPrepare(context, delegate, {0}); @@ -256,10 +133,10 @@ TEST_F(KernelTest, BadNumberOfOutputs) { } TEST_F(KernelTest, IncompatibleNodeDef) { - AddTensors(2, {0}, {1}); + AddTensors(2, {0}, {1}, kTfLiteFloat32, {3}); - // Cast is a TF op, but we don't add the proper nodedef to it in AddOp. - AddOp("Cast", {0}, {1}); + // Cast is a TF op, but we don't add the proper nodedef to it in AddTfOp. + AddTfOp(testing::kIncompatibleNodeDef, {0}, {1}); ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { return GenericPrepare(context, delegate, {0}); @@ -274,11 +151,11 @@ TEST_F(KernelTest, IncompatibleNodeDef) { } TEST_F(KernelTest, WrongSetOfNodes) { - AddTensors(4, {0}, {3}); - AddOp(kUnpack, {0}, {1, 2}); - AddTfLiteOp(kMul, {1, 2}, {3}); + AddTensors(4, {0}, {3}, kTfLiteFloat32, {3}); + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfLiteMulOp({1, 2}, {3}); - // Specify that kMul (#1) is supported when it actually isn't. + // Specify that testing::kMul (#1) is supported when it actually isn't. ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { return GenericPrepare(context, delegate, {0, 1}); }); @@ -292,13 +169,13 @@ TEST_F(KernelTest, WrongSetOfNodes) { } TEST_F(KernelTest, MixedGraph) { - AddTensors(9, {0, 3}, {8}); + AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3}); - AddOp(kUnpack, {0}, {1, 2}); - AddOp(kUnpack, {3}, {4, 5}); - AddOp(kAdd, {1, 4}, {6}); - AddOp(kAdd, {2, 5}, {7}); - AddTfLiteOp(kMul, {6, 7}, {8}); + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); + AddTfOp(testing::kAdd, {1, 4}, {6}); + AddTfOp(testing::kAdd, {2, 5}, {7}); + AddTfLiteMulOp({6, 7}, {8}); ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { return GenericPrepare(context, delegate, {0, 1, 2, 3}); @@ -316,16 +193,16 @@ TEST_F(KernelTest, MixedGraph) { } TEST_F(KernelTest, SplitGraph) { - AddTensors(10, {0}, {9}); + AddTensors(10, {0}, {9}, kTfLiteFloat32, {3}); - AddOp(kUnpack, {0}, {1, 2}); - AddOp(kAdd, {1, 2}, {3}); - AddOp(kUnpack, {3}, {4, 5}); + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kAdd, {1, 2}, {3}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); - AddTfLiteOp(kMul, {4, 5}, {6}); + AddTfLiteMulOp({4, 5}, {6}); - AddOp(kUnpack, {6}, {7, 8}); - AddOp(kAdd, {7, 8}, {9}); + AddTfOp(testing::kUnpack, {6}, {7, 8}); + AddTfOp(testing::kAdd, {7, 8}, {9}); ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { return GenericPrepare(context, delegate, {0, 1, 2, 4, 5}); diff --git a/tensorflow/contrib/lite/delegates/eager/test_util.cc b/tensorflow/contrib/lite/delegates/eager/test_util.cc new file mode 100644 index 0000000000000000000000000000000000000000..26d96acc82064ba1046555940e1b1132874ef23e --- /dev/null +++ b/tensorflow/contrib/lite/delegates/eager/test_util.cc @@ -0,0 +1,155 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/lite/delegates/eager/test_util.h" + +#include "absl/memory/memory.h" +#include "flatbuffers/flexbuffers.h" +#include "tensorflow/contrib/lite/string.h" + +namespace tflite { +namespace eager { +namespace testing { + +bool EagerModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; } + +void EagerModelTest::SetValues(int tensor_index, + const std::vector& values) { + float* v = interpreter_->typed_tensor(tensor_index); + for (float f : values) { + *v++ = f; + } +} + +std::vector EagerModelTest::GetValues(int tensor_index) { + TfLiteTensor* o = interpreter_->tensor(tensor_index); + return std::vector(o->data.f, o->data.f + o->bytes / sizeof(float)); +} + +void EagerModelTest::SetShape(int tensor_index, + const std::vector& values) { + ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk); + ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk); +} + +std::vector EagerModelTest::GetShape(int tensor_index) { + std::vector result; + auto* dims = interpreter_->tensor(tensor_index)->dims; + result.reserve(dims->size); + for (int i = 0; i < dims->size; ++i) { + result.push_back(dims->data[i]); + } + return result; +} + +void EagerModelTest::AddTensors(int num_tensors, const std::vector& inputs, + const std::vector& outputs, + const TfLiteType& type, + const std::vector& dims) { + interpreter_->AddTensors(num_tensors); + for (int i = 0; i < num_tensors; ++i) { + TfLiteQuantizationParams quant; + CHECK_EQ(interpreter_->SetTensorParametersReadWrite(i, type, + /*name=*/"", + /*dims=*/dims, quant), + kTfLiteOk); + } + + CHECK_EQ(interpreter_->SetInputs(inputs), kTfLiteOk); + CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk); +} + +void EagerModelTest::AddTfLiteMulOp(const std::vector& inputs, + const std::vector& outputs) { + static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + reg.builtin_code = BuiltinOperator_MUL; + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + auto* i0 = &context->tensors[node->inputs->data[0]]; + auto* o = &context->tensors[node->outputs->data[0]]; + return context->ResizeTensor(context, o, TfLiteIntArrayCopy(i0->dims)); + }; + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + auto* i0 = &context->tensors[node->inputs->data[0]]; + auto* i1 = &context->tensors[node->inputs->data[1]]; + auto* o = &context->tensors[node->outputs->data[0]]; + for (int i = 0; i < o->bytes / sizeof(float); ++i) { + o->data.f[i] = i0->data.f[i] * i1->data.f[i]; + } + return kTfLiteOk; + }; + + CHECK_EQ(interpreter_->AddNodeWithParameters(inputs, outputs, nullptr, 0, + nullptr, ®), + kTfLiteOk); +} + +void EagerModelTest::AddTfOp(TfOpType op, const std::vector& inputs, + const std::vector& outputs) { + auto attr = [](const string& key, const string& value) { + return " attr{ key: '" + key + "' value {" + value + "}}"; + }; + + if (op == kUnpack) { + string attributes = attr("T", "type: DT_FLOAT") + attr("num", "i: 2") + + attr("axis", "i: 0"); + AddTfOp("EagerUnpack", "Unpack", attributes, inputs, outputs); + } else if (op == kIdentity) { + string attributes = attr("T", "type: DT_FLOAT"); + AddTfOp("EagerIdentity", "Identity", attributes, inputs, outputs); + } else if (op == kAdd) { + string attributes = attr("T", "type: DT_FLOAT"); + AddTfOp("EagerAdd", "Add", attributes, inputs, outputs); + } else if (op == kMul) { + string attributes = attr("T", "type: DT_FLOAT"); + AddTfOp("EagerMul", "Mul", attributes, inputs, outputs); + } else if (op == kNonExistent) { + AddTfOp("NonExistentOp", "NonExistentOp", "", inputs, outputs); + } else if (op == kIncompatibleNodeDef) { + // "Cast" op is created without attributes - making it incompatible. + AddTfOp("EagerCast", "Cast", "", inputs, outputs); + } +} + +void EagerModelTest::AddTfOp(const char* tflite_name, const string& tf_name, + const string& nodedef_str, + const std::vector& inputs, + const std::vector& outputs) { + static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + reg.builtin_code = BuiltinOperator_CUSTOM; + reg.custom_name = tflite_name; + + tensorflow::NodeDef nodedef; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + nodedef_str + " op: '" + tf_name + "'", &nodedef)); + string serialized_nodedef; + CHECK(nodedef.SerializeToString(&serialized_nodedef)); + flexbuffers::Builder fbb; + fbb.Vector([&]() { + fbb.String(nodedef.op()); + fbb.String(serialized_nodedef); + }); + fbb.Finish(); + + flexbuffers_.push_back(fbb.GetBuffer()); + auto& buffer = flexbuffers_.back(); + CHECK_EQ(interpreter_->AddNodeWithParameters( + inputs, outputs, reinterpret_cast(buffer.data()), + buffer.size(), nullptr, ®), + kTfLiteOk); +} + +} // namespace testing +} // namespace eager +} // namespace tflite diff --git a/tensorflow/contrib/lite/delegates/eager/test_util.h b/tensorflow/contrib/lite/delegates/eager/test_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0eab9e1135f02b4f22a4b36a85cf6771fbbb81d5 --- /dev/null +++ b/tensorflow/contrib/lite/delegates/eager/test_util.h @@ -0,0 +1,97 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_TEST_UTIL_H_ +#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_TEST_UTIL_H_ + +#include "tensorflow/c/c_api_internal.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" + +namespace tflite { +namespace eager { +namespace testing { + +enum TfOpType { + kUnpack, + kIdentity, + kAdd, + kMul, + // Represents an op that does not exist in TensorFlow. + kNonExistent, + // Represents an valid TensorFlow op where the NodeDef is incompatible. + kIncompatibleNodeDef, +}; + +// This class creates models with TF and TFLite ops. In order to use this class +// to test the Eager delegate, implement a function that calls +// interpreter->ModifyGraphWithDelegate. +class EagerModelTest : public ::testing::Test { + public: + EagerModelTest() {} + ~EagerModelTest() {} + + bool Invoke(); + + // Sets the tensor's values at the given index. + void SetValues(int tensor_index, const std::vector& values); + + // Returns the tensor's values at the given index. + std::vector GetValues(int tensor_index); + + // Sets the tensor's shape at the given index. + void SetShape(int tensor_index, const std::vector& values); + + // Returns the tensor's shape at the given index. + std::vector GetShape(int tensor_index); + + const TestErrorReporter& error_reporter() const { return error_reporter_; } + + // Adds `num_tensor` tensors to the model. `inputs` contains the indices of + // the input tensors and `outputs` contains the indices of the output + // tensors. All tensors are set to have `type` and `dims`. + void AddTensors(int num_tensors, const std::vector& inputs, + const std::vector& outputs, const TfLiteType& type, + const std::vector& dims); + + // Adds a TFLite Mul op. `inputs` contains the indices of the input tensors + // and `outputs` contains the indices of the output tensors. + void AddTfLiteMulOp(const std::vector& inputs, + const std::vector& outputs); + + // Adds a TensorFlow op. `inputs` contains the indices of the + // input tensors and `outputs` contains the indices of the output tensors. + // This function is limited to the set of ops defined in TfOpType. + void AddTfOp(TfOpType op, const std::vector& inputs, + const std::vector& outputs); + + protected: + std::unique_ptr interpreter_; + TestErrorReporter error_reporter_; + + private: + // Helper method to add a TensorFlow op. tflite_names needs to start with + // "Eager" in order to work with the Eager delegate. + void AddTfOp(const char* tflite_name, const string& tf_name, + const string& nodedef_str, const std::vector& inputs, + const std::vector& outputs); + + std::vector> flexbuffers_; +}; + +} // namespace testing +} // namespace eager +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_TEST_UTIL_H_ diff --git a/tensorflow/contrib/lite/delegates/eager/util.cc b/tensorflow/contrib/lite/delegates/eager/util.cc index 4426c653e6ff80aac52b50e06a3005173490433d..c8aa0b7f69f8f6bd3bff52b13f3cc7d689a514da 100644 --- a/tensorflow/contrib/lite/delegates/eager/util.cc +++ b/tensorflow/contrib/lite/delegates/eager/util.cc @@ -13,10 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/delegates/eager/util.h" +#include "tensorflow/contrib/lite/delegates/eager/constants.h" namespace tflite { namespace eager { +bool IsEagerOp(const char* custom_name) { + return custom_name && strncmp(custom_name, kCustomCodePrefix, + strlen(kCustomCodePrefix)) == 0; +} + TfLiteStatus ConvertStatus(TfLiteContext* context, const tensorflow::Status& status) { if (!status.ok()) { diff --git a/tensorflow/contrib/lite/delegates/eager/util.h b/tensorflow/contrib/lite/delegates/eager/util.h index a9407be071192e9b7f25f95df9e76a5f44e7c9e3..b7363361bec47f30e0741e3a76a5a375d7d9aeb1 100644 --- a/tensorflow/contrib/lite/delegates/eager/util.h +++ b/tensorflow/contrib/lite/delegates/eager/util.h @@ -23,6 +23,10 @@ limitations under the License. namespace tflite { namespace eager { +// Checks whether the prefix of the custom name indicates the operation is an +// Eager operation. +bool IsEagerOp(const char* custom_name); + // Converts a tensorflow:Status into a TfLiteStatus. If the original status // represented an error, reports it using the given 'context'. TfLiteStatus ConvertStatus(TfLiteContext* context, diff --git a/tensorflow/contrib/lite/delegates/eager/util_test.cc b/tensorflow/contrib/lite/delegates/eager/util_test.cc index c4fbf5412776a2c5743e8d72fc6729cfd709c545..541d0b170197f7ac657cccfb79769522887e87e5 100644 --- a/tensorflow/contrib/lite/delegates/eager/util_test.cc +++ b/tensorflow/contrib/lite/delegates/eager/util_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/contrib/lite/string.h" #include "tensorflow/contrib/lite/testing/util.h" namespace tflite { @@ -102,6 +103,16 @@ TEST(UtilTest, TypeConversions) { EXPECT_EQ(TF_BOOL, GetTensorFlowDataType(kTfLiteBool)); } +TEST(UtilTest, IsEagerOp) { + EXPECT_TRUE(IsEagerOp("Eager")); + EXPECT_TRUE(IsEagerOp("EagerOp")); + EXPECT_FALSE(IsEagerOp("eager")); + EXPECT_FALSE(IsEagerOp("Eage")); + EXPECT_FALSE(IsEagerOp("OpEager")); + EXPECT_FALSE(IsEagerOp(nullptr)); + EXPECT_FALSE(IsEagerOp("")); +} + } // namespace } // namespace eager } // namespace tflite diff --git a/tensorflow/contrib/lite/delegates/nnapi/BUILD b/tensorflow/contrib/lite/delegates/nnapi/BUILD index 091f8fbce734b466de33bb4b84e5e0fc3e4a71ef..954955f24b87f79a8dbe2863f608d532e25902c6 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/BUILD +++ b/tensorflow/contrib/lite/delegates/nnapi/BUILD @@ -22,7 +22,10 @@ tf_cc_test( name = "nnapi_delegate_test", size = "small", srcs = ["nnapi_delegate_test.cc"], - tags = ["no_oss"], + tags = [ + "no_oss", + "noasan", # TODO(b/112326936): re-enable for asan once fixed. + ], deps = [ ":nnapi_delegate", "//tensorflow/contrib/lite:framework", diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc index 60855eb8edc4fb708d76b1e3a4ac37d462a64465..e6cc3dd99c2e18bf297f8fac244e5d809954a01a 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc @@ -27,7 +27,9 @@ limitations under the License. #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h" #ifdef __ANDROID__ +#include #include +#include #endif namespace tflite { @@ -80,6 +82,44 @@ struct NNFreeCompilation { } }; +// Manage NNAPI shared memory handle +class NNMemory { + public: + NNMemory(const char* name, size_t size) { +#ifdef __ANDROID__ + byte_size_ = size; + fd_ = ASharedMemory_create(name, size); + data_ptr_ = reinterpret_cast( + mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0)); + ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0, + &nn_memory_handle_); +#endif + } + + ~NNMemory() { +#ifdef __ANDROID__ + if (data_ptr_) { + munmap(data_ptr_, byte_size_); + } + if (nn_memory_handle_) { + ANeuralNetworksMemory_free(nn_memory_handle_); + } + if (fd_ > 0) close(fd_); +#endif + } + + ANeuralNetworksMemory* get_handle() { return nn_memory_handle_; } + uint8_t* get_data_ptr() { return data_ptr_; } + + private: +#ifdef __ANDROID__ + int fd_ = 0; + size_t byte_size_ = 0; +#endif + uint8_t* data_ptr_ = nullptr; + ANeuralNetworksMemory* nn_memory_handle_ = nullptr; +}; // namespace + // Track tensor indices to NN API tensor indices mapping. class OperandMapping { public: @@ -142,6 +182,12 @@ class NNAPIOpBuilder { ANEURALNETWORKS_TENSOR_INT32); } + TfLiteStatus AddVectorFloat32Operand(const float* values, + uint32_t num_values) { + return AddVectorOperand(values, num_values, + ANEURALNETWORKS_TENSOR_FLOAT32); + } + TfLiteStatus AddPoolingParams(void* data) { auto builtin = reinterpret_cast(data); AddScalarInt32Operand(builtin->padding); @@ -167,6 +213,37 @@ class NNAPIOpBuilder { return kTfLiteOk; } + TfLiteStatus AddAdditionalFloat32OutputTensor(uint32_t dimension_count) { + std::vector dims(dimension_count, 0); + ANeuralNetworksOperandType operand_type{ + .type = ANEURALNETWORKS_TENSOR_FLOAT32, + .dimensionCount = dimension_count, + .dimensions = dims.data()}; + CHECK_NN(context_, + ANeuralNetworksModel_addOperand(nn_model_, &operand_type)); + int ann_operand = operand_mapping_->add_new_non_tensor_operand(); + augmented_outputs_.push_back(ann_operand); + return kTfLiteOk; + } + + TfLiteStatus AddStateFloat32Tensor(int tensor_index, + int* ann_tensor_index_out) { + TfLiteTensor* tensor = &context_->tensors[tensor_index]; + int ann_index = operand_mapping_->add_new_non_tensor_operand(); + + ANeuralNetworksOperandType operand_type{ + ANEURALNETWORKS_TENSOR_FLOAT32, + static_cast(tensor->dims->size), + reinterpret_cast(tensor->dims->data), tensor->params.scale, + tensor->params.zero_point}; + CHECK_NN(context_, + ANeuralNetworksModel_addOperand(nn_model_, &operand_type)); + augmented_inputs_.push_back(ann_index); + + *ann_tensor_index_out = ann_index; + return kTfLiteOk; + } + // Adds a new NN API tensor that shadows the TF Lite tensor `tensor_index`. // This returns the NN API tensor index corresponding to the created tensor. // If another caller previously created a NN API tensor for `tensor_index` @@ -198,6 +275,10 @@ class NNAPIOpBuilder { nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; scale = tensor->params.scale; zeroPoint = tensor->params.zero_point; + if (scale == 0) { + // TENSOR_QUANT8_ASYMM with zero scale is not valid in NNAPI. + scale = 1; + } break; case kTfLiteInt32: nn_type = ANEURALNETWORKS_TENSOR_INT32; @@ -285,14 +366,21 @@ class NNAPIOpBuilder { std::vector augmented_outputs_; }; +struct NNAPIOpMappingArgs { + TfLiteContext* context; + NNAPIOpBuilder* builder; + TfLiteNode* node; + std::vector* model_state_inputs; + std::vector* model_state_tfl_outputs; +}; + // The kernel that represents the subgraph of TF Lite being run on NN API. class NNAPIDelegateKernel { public: NNAPIDelegateKernel() = default; - typedef ANeuralNetworksOperationType (*MappingFn)(TfLiteContext*, - NNAPIOpBuilder* builder, - TfLiteNode* node); + typedef ANeuralNetworksOperationType (*MappingFn)( + const NNAPIOpMappingArgs& mapping_args); // Return a function that knows how to translate a node into its operands // when called. You can use this function to see if a node is supported @@ -302,11 +390,11 @@ class NNAPIDelegateKernel { switch (builtin_code) { case kTfLiteBuiltinAdd: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarInt32Operand(builtin->activation); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_ADD; }; } else { @@ -315,11 +403,11 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinMul: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarInt32Operand(builtin->activation); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_MUL; }; } else { @@ -328,9 +416,10 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinAveragePool2d: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - builder->AddPoolingParams(node->builtin_data); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + mapping_args.builder->AddPoolingParams( + mapping_args.node->builtin_data); return ANEURALNETWORKS_AVERAGE_POOL_2D; }; } else { @@ -339,9 +428,10 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinMaxPool2d: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - builder->AddPoolingParams(node->builtin_data); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + mapping_args.builder->AddPoolingParams( + mapping_args.node->builtin_data); return ANEURALNETWORKS_MAX_POOL_2D; }; } else { @@ -350,9 +440,10 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinL2Pool2d: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - builder->AddPoolingParams(node->builtin_data); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + mapping_args.builder->AddPoolingParams( + mapping_args.node->builtin_data); return ANEURALNETWORKS_L2_POOL_2D; }; } else { @@ -368,14 +459,14 @@ class NNAPIDelegateKernel { // NNAPI does not support dilated Conv2D. return nullptr; } - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarInt32Operand(builtin->padding); - builder->AddScalarInt32Operand(builtin->stride_width); - builder->AddScalarInt32Operand(builtin->stride_height); - builder->AddScalarInt32Operand(builtin->activation); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->padding); + mapping_args.builder->AddScalarInt32Operand(builtin->stride_width); + mapping_args.builder->AddScalarInt32Operand(builtin->stride_height); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_CONV_2D; }; } else { @@ -384,15 +475,16 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinDepthwiseConv2d: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( - node->builtin_data); - builder->AddScalarInt32Operand(builtin->padding); - builder->AddScalarInt32Operand(builtin->stride_width); - builder->AddScalarInt32Operand(builtin->stride_height); - builder->AddScalarInt32Operand(builtin->depth_multiplier); - builder->AddScalarInt32Operand(builtin->activation); + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->padding); + mapping_args.builder->AddScalarInt32Operand(builtin->stride_width); + mapping_args.builder->AddScalarInt32Operand(builtin->stride_height); + mapping_args.builder->AddScalarInt32Operand( + builtin->depth_multiplier); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_DEPTHWISE_CONV_2D; }; } else { @@ -401,11 +493,11 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinFullyConnected: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( - node->builtin_data); - builder->AddScalarInt32Operand(builtin->activation); + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_FULLY_CONNECTED; }; } else { @@ -414,11 +506,11 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinSoftmax: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarFloat32Operand(builtin->beta); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarFloat32Operand(builtin->beta); return ANEURALNETWORKS_SOFTMAX; }; } else { @@ -427,8 +519,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinReshape: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_RESHAPE; }; } else { @@ -437,13 +529,13 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinSqueeze: if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); // Note that we add the squeeze dimensions even if the dimensions // were unspecified (empty), as NNAPI requires the operand. - builder->AddVectorInt32Operand( + mapping_args.builder->AddVectorInt32Operand( builtin->squeeze_dims, static_cast(builtin->num_squeeze_dims)); return ANEURALNETWORKS_SQUEEZE; @@ -458,21 +550,21 @@ class NNAPIDelegateKernel { // NNAPI does not support activations return nullptr; } - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_L2_NORMALIZATION; }; } case kTfLiteBuiltinLocalResponseNormalization: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( - node->builtin_data); - builder->AddScalarInt32Operand(builtin->radius); - builder->AddScalarFloat32Operand(builtin->bias); - builder->AddScalarFloat32Operand(builtin->alpha); - builder->AddScalarFloat32Operand(builtin->beta); + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->radius); + mapping_args.builder->AddScalarFloat32Operand(builtin->bias); + mapping_args.builder->AddScalarFloat32Operand(builtin->alpha); + mapping_args.builder->AddScalarFloat32Operand(builtin->beta); return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION; }; } else { @@ -488,11 +580,11 @@ class NNAPIDelegateKernel { ->type == kTfLiteLshProjectionSparse) { return nullptr; } - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( - node->builtin_data); - builder->AddScalarInt32Operand(builtin->type); + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->type); return ANEURALNETWORKS_LSH_PROJECTION; }; } else { @@ -515,11 +607,11 @@ class NNAPIDelegateKernel { } } } - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( - node->builtin_data); - builder->AddScalarInt32Operand(builtin->axis); + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->axis); return ANEURALNETWORKS_CONCATENATION; }; } else { @@ -528,8 +620,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinDequantize: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_DEQUANTIZE; }; } else { @@ -538,8 +630,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinFloor: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_FLOOR; }; } else { @@ -548,8 +640,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinRelu: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_RELU; }; } else { @@ -558,8 +650,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinReluN1To1: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_RELU1; }; } else { @@ -568,8 +660,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinRelu6: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_RELU6; }; } else { @@ -578,8 +670,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinLogistic: if (version == 1) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_LOGISTIC; }; } else { @@ -591,8 +683,8 @@ class NNAPIDelegateKernel { if (version == 1 && context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) { // NNAPI only support float tanh. - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_TANH; }; } else { @@ -603,11 +695,11 @@ class NNAPIDelegateKernel { if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 && context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) { // NNAPI only support float sub. - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarInt32Operand(builtin->activation); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_SUB; }; } else { @@ -618,11 +710,11 @@ class NNAPIDelegateKernel { if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 && context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) { // NNAPI only support float div. - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarInt32Operand(builtin->activation); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); return ANEURALNETWORKS_DIV; }; } else { @@ -636,8 +728,8 @@ class NNAPIDelegateKernel { // NNAPI does not support specifying the padding value. // NNAPI pads physical zero for quantized tensors, so only delegate // float pad to NNAPI. - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_PAD; }; } else { @@ -646,8 +738,8 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinSpaceToBatchNd: if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_SPACE_TO_BATCH_ND; }; } else { @@ -656,13 +748,14 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinStridedSlice: if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { - auto builtin = - reinterpret_cast(node->builtin_data); - builder->AddScalarInt32Operand(builtin->begin_mask); - builder->AddScalarInt32Operand(builtin->end_mask); - builder->AddScalarInt32Operand(builtin->shrink_axis_mask); + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->begin_mask); + mapping_args.builder->AddScalarInt32Operand(builtin->end_mask); + mapping_args.builder->AddScalarInt32Operand( + builtin->shrink_axis_mask); return ANEURALNETWORKS_STRIDED_SLICE; }; } else { @@ -678,14 +771,146 @@ class NNAPIDelegateKernel { (node->inputs->size > 1) && (context->tensors[node->inputs->data[1]].allocation_type == kTfLiteMmapRo)) { - return [](TfLiteContext* context, NNAPIOpBuilder* builder, - TfLiteNode* node) -> ANeuralNetworksOperationType { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { return ANEURALNETWORKS_TRANSPOSE; }; } else { return nullptr; } break; + case kTfLiteBuiltinRnn: + // NNAPI only support float32 weights. + // TODO(miaowang): check the number of inputs before accessing it. + if (version == 1 && + context->tensors[node->inputs->data[/*kWeightsTensor*/ 1]].type == + kTfLiteFloat32) { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + // NNAPI need both state_in and state_out. + int ann_index; + mapping_args.builder->AddStateFloat32Tensor( + mapping_args.node->outputs->data[/*kHiddenStateTensor*/ 0], + &ann_index); + mapping_args.model_state_inputs->push_back(ann_index); + mapping_args.model_state_tfl_outputs->push_back( + mapping_args.node->outputs->data[/*kHiddenStateTensor*/ 0]); + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); + return ANEURALNETWORKS_RNN; + }; + } else { + return nullptr; + } + break; + case kTfLiteBuiltinSvdf: + // NNAPI only support float32 weights. + if (version == 1 && + context->tensors[node->inputs->data[/*kWeightsFeatureTensor*/ 1]] + .type == kTfLiteFloat32) { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + // NNAPI need both state_in and state_out. + int ann_index; + mapping_args.builder->AddStateFloat32Tensor( + mapping_args.node->outputs->data[/*kStateTensor*/ 0], + &ann_index); + mapping_args.model_state_inputs->push_back(ann_index); + mapping_args.model_state_tfl_outputs->push_back( + mapping_args.node->outputs->data[/*kStateTensor*/ 0]); + + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->rank); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); + return ANEURALNETWORKS_SVDF; + }; + } else { + return nullptr; + } + break; + case kTfLiteBuiltinLstm: + // NNAPI only support float32 weights. + // TODO(miaowang): add loggings to indicate why the op is rejected. + if (version == 1 && node->inputs->size == 18 && + context->tensors[node->inputs + ->data[/*kInputToOutputWeightsTensor*/ 4]] + .type == kTfLiteFloat32) { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + // NNAPI need both state_in and state_out for cell_state and + // output_state. + int ann_index; + mapping_args.builder->AddStateFloat32Tensor( + mapping_args.node->outputs->data[/*kOutputStateTensor*/ 0], + &ann_index); + mapping_args.model_state_inputs->push_back(ann_index); + mapping_args.model_state_tfl_outputs->push_back( + mapping_args.node->outputs->data[/*kOutputStateTensor*/ 0]); + mapping_args.builder->AddStateFloat32Tensor( + mapping_args.node->outputs->data[/*kCellStateTensor*/ 1], + &ann_index); + mapping_args.model_state_inputs->push_back(ann_index); + mapping_args.model_state_tfl_outputs->push_back( + mapping_args.node->outputs->data[/*kCellStateTensor*/ 1]); + + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarInt32Operand(builtin->activation); + mapping_args.builder->AddScalarFloat32Operand(builtin->cell_clip); + mapping_args.builder->AddScalarFloat32Operand(builtin->proj_clip); + + // Current NNAPI implementation requires the sratch_buffer as + // output. + mapping_args.builder->AddAdditionalFloat32OutputTensor(2); + return ANEURALNETWORKS_LSTM; + }; + } else { + return nullptr; + } + break; + case kTfLiteBuiltinMean: + // NNAPI does not support generating a scalar as output for MEAN. + if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 && + context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 && + context->tensors[node->outputs->data[0]].dims->size > 0) { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + int32_t keep_dims = 0; + if (builtin->keep_dims) keep_dims = 1; + mapping_args.builder->AddScalarInt32Operand(keep_dims); + return ANEURALNETWORKS_MEAN; + }; + } else { + return nullptr; + } + case kTfLiteBuiltinEmbeddingLookup: + // NNAPI only support float32 values. + if (version == 1 && + context->tensors[node->inputs->data[1]].type == kTfLiteFloat32) { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + return ANEURALNETWORKS_EMBEDDING_LOOKUP; + }; + } else { + return nullptr; + } + break; + case kTfLiteBuiltinHashtableLookup: + // NNAPI only support float32 output. + if (version == 1 && + context->tensors[node->outputs->data[0]].type == kTfLiteFloat32) { + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + return ANEURALNETWORKS_HASHTABLE_LOOKUP; + }; + } else { + return nullptr; + } + break; default: return nullptr; } @@ -725,27 +950,56 @@ class NNAPIDelegateKernel { // Set the input tensor buffers. Note: we access tflite tensors using // absolute indices but NN api indices inputs by relative indices. int relative_input_index = 0; + int num_optional_tensors = 0; + + size_t input_offset = 0; for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) { + if (absolute_input_index == kOptionalTensor) { + num_optional_tensors++; + continue; + } TfLiteTensor* tensor = &context->tensors[absolute_input_index]; // TODO(miaowang): make sure the delegation works with dequantized weights // as intermediate tensors. if (tensor->allocation_type != kTfLiteMmapRo) { - CHECK_NN(context, ANeuralNetworksExecution_setInput( + // copy data to pre-allocated shared memory. + memcpy(nn_input_memory_->get_data_ptr() + input_offset, + tensor->data.raw, tensor->bytes); + CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory( execution, relative_input_index, nullptr, - tensor->data.raw, tensor->bytes)); + nn_input_memory_->get_handle(), input_offset, + tensor->bytes)); + input_offset += tensor->bytes; relative_input_index++; } } // Set the output tensor buffers. int relative_output_index = 0; + size_t output_offset = 0; for (auto output_index : TfLiteIntArrayView(node->outputs)) { TfLiteTensor* tensor = &context->tensors[output_index]; - CHECK_NN(context, ANeuralNetworksExecution_setOutput( + CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory( execution, relative_output_index, nullptr, - tensor->data.raw, tensor->bytes)); + nn_output_memory_->get_handle(), output_offset, + tensor->bytes)); + output_offset += tensor->bytes; relative_output_index++; } + + // The state_out of previous invocation need to be mapped to state_in of + // current invocation. + for (size_t i = 0; i < model_state_tfl_outputs_.size(); i++) { + int state_tensor_idx = model_state_tfl_outputs_[i]; + TfLiteTensor* tensor = &context->tensors[state_tensor_idx]; + // Here we are using a deep copy for state_in tensors so that we are not + // reading and writing into the same buffer during a invocation. + // TODO(110369471): using double shared buffer to minimize the copies. + CHECK_NN(context, + ANeuralNetworksExecution_setInput( + execution, i + node->inputs->size - num_optional_tensors, + nullptr, tensor->data.raw, tensor->bytes)); + } // Invoke ANN in blocking fashion. ANeuralNetworksEvent* event = nullptr; CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event)); @@ -753,6 +1007,15 @@ class NNAPIDelegateKernel { ANeuralNetworksEvent_free(event); ANeuralNetworksExecution_free(execution); + // copy results from shared memory to the destination. + output_offset = 0; + for (auto output_index : TfLiteIntArrayView(node->outputs)) { + TfLiteTensor* tensor = &context->tensors[output_index]; + memcpy(tensor->data.raw, + nn_output_memory_->get_data_ptr() + output_offset, tensor->bytes); + output_offset += tensor->bytes; + } + return kTfLiteOk; } @@ -767,6 +1030,12 @@ class NNAPIDelegateKernel { // Track indices we use OperandMapping operand_mapping_; + std::vector model_state_inputs_; + std::vector model_state_tfl_outputs_; + + std::unique_ptr nn_input_memory_; + std::unique_ptr nn_output_memory_; + TfLiteStatus AddOpsAndTensors(TfLiteContext* context) { // The operand builder allows creating a single op. We create it at this // reduced power position rather than in the for loop to avoid reallocating @@ -781,11 +1050,22 @@ class NNAPIDelegateKernel { context->GetNodeAndRegistration(context, node_index, &node, ®); // Map inputs to NN API tensor indices. for (auto input_index : TfLiteIntArrayView(node->inputs)) { - TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index)); + if (input_index == kOptionalTensor && + (reg->builtin_code == kTfLiteBuiltinLstm || + reg->builtin_code == kTfLiteBuiltinSvdf)) { + // properly handle the optional tensor for LSTM and SVDF. + // currently only support float32. + // TODO(miaowang): make sure this is also able to handle quantized + // tensor when supported by NNAPI. + TF_LITE_ENSURE_STATUS(builder.AddVectorFloat32Operand(nullptr, 0)); + } else { + TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index)); + } } // Get op type and operands - int nn_op_type = Map(context, reg->builtin_code, reg->version, node)( - context, &builder, node); + int nn_op_type = Map(context, reg->builtin_code, reg->version, + node)({context, &builder, node, &model_state_inputs_, + &model_state_tfl_outputs_}); // Map outputs to NN API tensor indices. for (auto output_index : TfLiteIntArrayView(node->outputs)) { TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index)); @@ -806,15 +1086,29 @@ class NNAPIDelegateKernel { inputs.reserve(input_tensors->size); std::vector outputs; outputs.reserve(output_tensors->size); + + size_t total_input_byte_size = 0; // Make the TensorFlow lite inputs and outputs to ann_indices. for (int i : TfLiteIntArrayView(input_tensors)) { // Constant tensors are not NNAPI inputs. - if (context->tensors[i].allocation_type != kTfLiteMmapRo) { + if (i != kOptionalTensor && + context->tensors[i].allocation_type != kTfLiteMmapRo) { inputs.push_back(operand_mapping_.lite_index_to_ann(i)); + total_input_byte_size += context->tensors[i].bytes; } } - for (int i : TfLiteIntArrayView(output_tensors)) + + // Add state input tensors as model inputs + for (int i : model_state_inputs_) { + inputs.push_back(i); + } + + size_t total_output_byte_size = 0; + for (int i : TfLiteIntArrayView(output_tensors)) { outputs.push_back(operand_mapping_.lite_index_to_ann(i)); + total_output_byte_size += context->tensors[i].bytes; + } + // Tell ANN to declare inputs/outputs CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs( nn_model_.get(), inputs.size(), inputs.data(), @@ -822,6 +1116,11 @@ class NNAPIDelegateKernel { // Finalize the model CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get())); + // Create shared memory pool for inputs and outputs. + nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size)); + nn_output_memory_.reset( + new NNMemory("output_pool", total_output_byte_size)); + return kTfLiteOk; } }; diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc index b7b159c59f2f81b055d5d06436b70331cff3dea8..3224b23a0c3bc8456bd75f2923d16f0eed7d53ff 100644 --- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -1623,6 +1623,1898 @@ TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxisMask) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({1})); } +static float rnn_input[] = { + 0.23689353, 0.285385, 0.037029743, -0.19858193, -0.27569133, + 0.43773448, 0.60379338, 0.35562468, -0.69424844, -0.93421471, + -0.87287879, 0.37144363, -0.62476718, 0.23791671, 0.40060222, + 0.1356622, -0.99774903, -0.98858172, -0.38952237, -0.47685933, + 0.31073618, 0.71511042, -0.63767755, -0.31729108, 0.33468103, + 0.75801885, 0.30660987, -0.37354088, 0.77002847, -0.62747043, + -0.68572164, 0.0069220066, 0.65791464, 0.35130811, 0.80834007, + -0.61777675, -0.21095741, 0.41213346, 0.73784804, 0.094794154, + 0.47791874, 0.86496925, -0.53376222, 0.85315156, 0.10288584, + 0.86684, -0.011186242, 0.10513687, 0.87825835, 0.59929144, + 0.62827742, 0.18899453, 0.31440187, 0.99059987, 0.87170351, + -0.35091716, 0.74861872, 0.17831337, 0.2755419, 0.51864719, + 0.55084288, 0.58982027, -0.47443086, 0.20875752, -0.058871567, + -0.66609079, 0.59098077, 0.73017097, 0.74604273, 0.32882881, + -0.17503482, 0.22396147, 0.19379807, 0.29120302, 0.077113032, + -0.70331609, 0.15804303, -0.93407321, 0.40182066, 0.036301374, + 0.66521823, 0.0300982, -0.7747041, -0.02038002, 0.020698071, + -0.90300065, 0.62870288, -0.23068321, 0.27531278, -0.095755219, + -0.712036, -0.17384434, -0.50593495, -0.18646687, -0.96508682, + 0.43519354, 0.14744234, 0.62589407, 0.1653645, -0.10651493, + -0.045277178, 0.99032974, -0.88255352, -0.85147917, 0.28153265, + 0.19455957, -0.55479527, -0.56042433, 0.26048636, 0.84702539, + 0.47587705, -0.074295521, -0.12287641, 0.70117295, 0.90532446, + 0.89782166, 0.79817224, 0.53402734, -0.33286154, 0.073485017, + -0.56172788, -0.044897556, 0.89964068, -0.067662835, 0.76863563, + 0.93455386, -0.6324693, -0.083922029}; + +static float rnn_golden_output[] = { + 0.496726, 0, 0.965996, 0, 0.0584254, 0, + 0, 0.12315, 0, 0, 0.612266, 0.456601, + 0, 0.52286, 1.16099, 0.0291232, + + 0, 0, 0.524901, 0, 0, 0, + 0, 1.02116, 0, 1.35762, 0, 0.356909, + 0.436415, 0.0355727, 0, 0, + + 0, 0, 0, 0.262335, 0, 0, + 0, 1.33992, 0, 2.9739, 0, 0, + 1.31914, 2.66147, 0, 0, + + 0.942568, 0, 0, 0, 0.025507, 0, + 0, 0, 0.321429, 0.569141, 1.25274, 1.57719, + 0.8158, 1.21805, 0.586239, 0.25427, + + 1.04436, 0, 0.630725, 0, 0.133801, 0.210693, + 0.363026, 0, 0.533426, 0, 1.25926, 0.722707, + 0, 1.22031, 1.30117, 0.495867, + + 0.222187, 0, 0.72725, 0, 0.767003, 0, + 0, 0.147835, 0, 0, 0, 0.608758, + 0.469394, 0.00720298, 0.927537, 0, + + 0.856974, 0.424257, 0, 0, 0.937329, 0, + 0, 0, 0.476425, 0, 0.566017, 0.418462, + 0.141911, 0.996214, 1.13063, 0, + + 0.967899, 0, 0, 0, 0.0831304, 0, + 0, 1.00378, 0, 0, 0, 1.44818, + 1.01768, 0.943891, 0.502745, 0, + + 0.940135, 0, 0, 0, 0, 0, + 0, 2.13243, 0, 0.71208, 0.123918, 1.53907, + 1.30225, 1.59644, 0.70222, 0, + + 0.804329, 0, 0.430576, 0, 0.505872, 0.509603, + 0.343448, 0, 0.107756, 0.614544, 1.44549, 1.52311, + 0.0454298, 0.300267, 0.562784, 0.395095, + + 0.228154, 0, 0.675323, 0, 1.70536, 0.766217, + 0, 0, 0, 0.735363, 0.0759267, 1.91017, + 0.941888, 0, 0, 0, + + 0, 0, 1.5909, 0, 0, 0, + 0, 0.5755, 0, 0.184687, 0, 1.56296, + 0.625285, 0, 0, 0, + + 0, 0, 0.0857888, 0, 0, 0, + 0, 0.488383, 0.252786, 0, 0, 0, + 1.02817, 1.85665, 0, 0, + + 0.00981836, 0, 1.06371, 0, 0, 0, + 0, 0, 0, 0.290445, 0.316406, 0, + 0.304161, 1.25079, 0.0707152, 0, + + 0.986264, 0.309201, 0, 0, 0, 0, + 0, 1.64896, 0.346248, 0, 0.918175, 0.78884, + 0.524981, 1.92076, 2.07013, 0.333244, + + 0.415153, 0.210318, 0, 0, 0, 0, + 0, 2.02616, 0, 0.728256, 0.84183, 0.0907453, + 0.628881, 3.58099, 1.49974, 0}; + +static std::initializer_list rnn_weights = { + 0.461459, 0.153381, 0.529743, -0.00371218, 0.676267, -0.211346, + 0.317493, 0.969689, -0.343251, 0.186423, 0.398151, 0.152399, + 0.448504, 0.317662, 0.523556, -0.323514, 0.480877, 0.333113, + -0.757714, -0.674487, -0.643585, 0.217766, -0.0251462, 0.79512, + -0.595574, -0.422444, 0.371572, -0.452178, -0.556069, -0.482188, + -0.685456, -0.727851, 0.841829, 0.551535, -0.232336, 0.729158, + -0.00294906, -0.69754, 0.766073, -0.178424, 0.369513, -0.423241, + 0.548547, -0.0152023, -0.757482, -0.85491, 0.251331, -0.989183, + 0.306261, -0.340716, 0.886103, -0.0726757, -0.723523, -0.784303, + 0.0354295, 0.566564, -0.485469, -0.620498, 0.832546, 0.697884, + -0.279115, 0.294415, -0.584313, 0.548772, 0.0648819, 0.968726, + 0.723834, -0.0080452, -0.350386, -0.272803, 0.115121, -0.412644, + -0.824713, -0.992843, -0.592904, -0.417893, 0.863791, -0.423461, + -0.147601, -0.770664, -0.479006, 0.654782, 0.587314, -0.639158, + 0.816969, -0.337228, 0.659878, 0.73107, 0.754768, -0.337042, + 0.0960841, 0.368357, 0.244191, -0.817703, -0.211223, 0.442012, + 0.37225, -0.623598, -0.405423, 0.455101, 0.673656, -0.145345, + -0.511346, -0.901675, -0.81252, -0.127006, 0.809865, -0.721884, + 0.636255, 0.868989, -0.347973, -0.10179, -0.777449, 0.917274, + 0.819286, 0.206218, -0.00785118, 0.167141, 0.45872, 0.972934, + -0.276798, 0.837861, 0.747958, -0.0151566, -0.330057, -0.469077, + 0.277308, 0.415818}; + +static std::initializer_list rnn_recurrent_weights = { + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0.1}; + +static std::initializer_list rnn_bias = { + 0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068, -0.23566568, + -0.389184, 0.47481549, -0.4791103, 0.29931796, 0.10463274, 0.83918178, + 0.37197268, 0.61957061, 0.3956964, -0.37609905}; + +class RNNOpModel : public SingleOpModelWithNNAPI { + public: + RNNOpModel(int batches, int units, int size, + const TensorType& weights = TensorType_FLOAT32, + const TensorType& recurrent_weights = TensorType_FLOAT32) + : batches_(batches), units_(units), input_size_(size) { + input_ = AddInput(TensorType_FLOAT32); + weights_ = AddInput(weights); + recurrent_weights_ = AddInput(recurrent_weights); + bias_ = AddInput(TensorType_FLOAT32); + hidden_state_ = AddOutput(TensorType_FLOAT32); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp( + BuiltinOperator_RNN, BuiltinOptions_RNNOptions, + CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union()); + BuildInterpreter({{batches_, input_size_}, + {units_, input_size_}, + {units_, units_}, + {units_}}); + } + + void SetBias(std::initializer_list f) { PopulateTensor(bias_, f); } + + void SetWeights(std::initializer_list f) { + PopulateTensor(weights_, f); + } + + void SetRecurrentWeights(std::initializer_list f) { + PopulateTensor(recurrent_weights_, f); + } + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + void SetInput(int offset, float* begin, float* end) { + PopulateTensor(input_, offset, begin, end); + } + + void ResetHiddenState() { + const int zero_buffer_size = units_ * batches_; + std::unique_ptr zero_buffer(new float[zero_buffer_size]); + memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); + PopulateTensor(hidden_state_, 0, zero_buffer.get(), + zero_buffer.get() + zero_buffer_size); + } + + std::vector GetOutput() { return ExtractVector(output_); } + + int input_size() { return input_size_; } + int num_units() { return units_; } + int num_batches() { return batches_; } + + protected: + int input_; + int weights_; + int recurrent_weights_; + int bias_; + int hidden_state_; + int output_; + + int batches_; + int units_; + int input_size_; +}; + +TEST(NNAPIDelegate, RnnBlackBoxTest) { + RNNOpModel rnn(2, 16, 8); + rnn.SetWeights(rnn_weights); + rnn.SetBias(rnn_bias); + rnn.SetRecurrentWeights(rnn_recurrent_weights); + + rnn.ResetHiddenState(); + const int input_sequence_size = sizeof(rnn_input) / sizeof(float) / + (rnn.input_size() * rnn.num_batches()); + + for (int i = 0; i < input_sequence_size; i++) { + float* batch_start = rnn_input + i * rnn.input_size(); + float* batch_end = batch_start + rnn.input_size(); + rnn.SetInput(0, batch_start, batch_end); + rnn.SetInput(rnn.input_size(), batch_start, batch_end); + + rnn.Invoke(); + + float* golden_start = rnn_golden_output + i * rnn.num_units(); + float* golden_end = golden_start + rnn.num_units(); + std::vector expected; + expected.insert(expected.end(), golden_start, golden_end); + expected.insert(expected.end(), golden_start, golden_end); + + EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected))); + } +} + +static float svdf_input[] = { + 0.12609188, -0.46347019, -0.89598465, + 0.35867718, 0.36897406, 0.73463392, + + 0.14278367, -1.64410412, -0.75222826, + -0.57290924, 0.12729003, 0.7567004, + + 0.49837467, 0.19278903, 0.26584083, + 0.17660543, 0.52949083, -0.77931279, + + -0.11186574, 0.13164264, -0.05349274, + -0.72674477, -0.5683046, 0.55900657, + + -0.68892461, 0.37783599, 0.18263303, + -0.63690937, 0.44483393, -0.71817774, + + -0.81299269, -0.86831826, 1.43940818, + -0.95760226, 1.82078898, 0.71135032, + + -1.45006323, -0.82251364, -1.69082689, + -1.65087092, -1.89238167, 1.54172635, + + 0.03966608, -0.24936394, -0.77526885, + 2.06740379, -1.51439476, 1.43768692, + + 0.11771342, -0.23761693, -0.65898693, + 0.31088525, -1.55601168, -0.87661445, + + -0.89477462, 1.67204106, -0.53235275, + -0.6230064, 0.29819036, 1.06939757, +}; + +static float svdf_golden_output_rank_1[] = { + 0.014899, -0.0517661, -0.143725, -0.00271883, + -0.03004015, 0.09565311, 0.1587342, 0.00784263, + + 0.068281, -0.162217, -0.152268, 0.00323521, + 0.01582633, 0.03858774, -0.03001583, -0.02671271, + + -0.0317821, -0.0333089, 0.0609602, 0.0333759, + -0.01432795, 0.05524484, 0.1101355, -0.02382665, + + -0.00623099, -0.077701, -0.391193, -0.0136691, + -0.02333033, 0.02293761, 0.12338032, 0.04326871, + + 0.201551, -0.164607, -0.179462, -0.0592739, + 0.01064911, -0.17503069, 0.07821996, -0.00224009, + + 0.0886511, -0.0875401, -0.269283, 0.0281379, + -0.02282338, 0.09741908, 0.32973239, 0.12281385, + + -0.201174, -0.586145, -0.628624, -0.0330412, + 0.24780814, -0.39304617, -0.22473189, 0.02589256, + + -0.0839096, -0.299329, 0.108746, 0.109808, + 0.10084175, -0.06416984, 0.28936723, 0.0026358, + + 0.419114, -0.237824, -0.422627, 0.175115, + -0.2314795, -0.18584411, -0.4228974, -0.12928449, + + 0.36726, -0.522303, -0.456502, -0.175475, + 0.17012937, -0.34447709, 0.38505614, -0.28158101, +}; + +static float svdf_golden_output_rank_2[] = { + -0.09623547, -0.10193135, 0.11083051, -0.0347917, + 0.1141196, 0.12965347, -0.12652366, 0.01007236, + + -0.16396809, -0.21247184, 0.11259045, -0.04156673, + 0.10132131, -0.06143532, -0.00924693, 0.10084561, + + 0.01257364, 0.0506071, -0.19287863, -0.07162561, + -0.02033747, 0.22673416, 0.15487903, 0.02525555, + + -0.1411963, -0.37054959, 0.01774767, 0.05867489, + 0.09607603, -0.0141301, -0.08995658, 0.12867066, + + -0.27142537, -0.16955489, 0.18521598, -0.12528358, + 0.00331409, 0.11167502, 0.02218599, -0.07309391, + + 0.09593632, -0.28361851, -0.0773851, 0.17199151, + -0.00075242, 0.33691186, -0.1536046, 0.16572715, + + -0.27916506, -0.27626723, 0.42615682, 0.3225764, + -0.37472126, -0.55655634, -0.05013514, 0.289112, + + -0.24418658, 0.07540751, -0.1940318, -0.08911639, + 0.00732617, 0.46737891, 0.26449674, 0.24888524, + + -0.17225097, -0.54660404, -0.38795233, 0.08389944, + 0.07736043, -0.28260678, 0.15666828, 1.14949894, + + -0.57454878, -0.64704704, 0.73235172, -0.34616736, + 0.21120001, -0.22927976, 0.02455296, -0.35906726, +}; + +class BaseSVDFOpModel : public SingleOpModelWithNNAPI { + public: + BaseSVDFOpModel(int batches, int units, int input_size, int memory_size, + int rank, + TensorType weights_feature_type = TensorType_FLOAT32, + TensorType weights_time_type = TensorType_FLOAT32) + : batches_(batches), + units_(units), + input_size_(input_size), + memory_size_(memory_size), + rank_(rank) { + input_ = AddInput(TensorType_FLOAT32); + weights_feature_ = AddInput(weights_feature_type); + weights_time_ = AddInput(weights_time_type); + bias_ = AddNullInput(); + state_ = AddOutput(TensorType_FLOAT32); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp( + BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions, + CreateSVDFOptions(builder_, rank, ActivationFunctionType_NONE).Union()); + BuildInterpreter({ + {batches_, input_size_}, // Input tensor + {units_ * rank, input_size_}, // weights_feature tensor + {units_ * rank, memory_size_}, // weights_time tensor + {units_} // bias tensor + }); + } + + // Populates the weights_feature tensor. + void SetWeightsFeature(std::initializer_list f) { + PopulateTensor(weights_feature_, f); + } + + // Populates the weights_time tensor. + void SetWeightsTime(std::initializer_list f) { + PopulateTensor(weights_time_, f); + } + + // Populates the input tensor. + void SetInput(int offset, float* begin, float* end) { + PopulateTensor(input_, offset, begin, end); + } + + // Resets the state of SVDF op by filling it with 0's. + void ResetState() { + const int zero_buffer_size = rank_ * units_ * batches_ * memory_size_; + std::unique_ptr zero_buffer(new float[zero_buffer_size]); + memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); + PopulateTensor(state_, 0, zero_buffer.get(), + zero_buffer.get() + zero_buffer_size); + } + + // Extracts the output tensor from the SVDF op. + std::vector GetOutput() { return ExtractVector(output_); } + + int input_size() { return input_size_; } + int num_units() { return units_; } + int num_batches() { return batches_; } + + protected: + int input_; + int weights_feature_; + int weights_time_; + int bias_; + int state_; + int output_; + + int batches_; + int units_; + int input_size_; + int memory_size_; + int rank_; +}; + +class SVDFOpModel : public BaseSVDFOpModel { + public: + using BaseSVDFOpModel::BaseSVDFOpModel; + + void VerifyGoldens(float golden_input[], float golden_output[], + int golden_size, float tolerance = 1e-5) { + const int svdf_num_batches = num_batches(); + const int svdf_input_size = input_size(); + const int svdf_num_units = num_units(); + const int input_sequence_size = + golden_size / sizeof(float) / (svdf_input_size * svdf_num_batches); + // Going over each input batch, setting the input tensor, invoking the SVDF + // op and checking the output with the expected golden values. + for (int i = 0; i < input_sequence_size; i++) { + float* batch_start = + golden_input + i * svdf_input_size * svdf_num_batches; + float* batch_end = batch_start + svdf_input_size * svdf_num_batches; + SetInput(0, batch_start, batch_end); + + Invoke(); + + const float* golden_start = + golden_output + i * svdf_num_units * svdf_num_batches; + const float* golden_end = + golden_start + svdf_num_units * svdf_num_batches; + std::vector expected; + expected.insert(expected.end(), golden_start, golden_end); + + EXPECT_THAT(GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } + } +}; + +TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) { + SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, + /*memory_size=*/10, /*rank=*/1); + svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347, + 0.22197971, 0.12416199, 0.27901134, 0.27557442, + 0.3905206, -0.36137494, -0.06634006, -0.10640851}); + + svdf.SetWeightsTime( + {-0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657}); + + svdf.ResetState(); + svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input)); +} + +TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) { + SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3, + /*memory_size=*/10, /*rank=*/2); + svdf.SetWeightsFeature({-0.31930989, 0.0079667, 0.39296314, 0.37613347, + 0.12416199, 0.15785322, 0.27901134, 0.3905206, + 0.21931258, -0.36137494, -0.10640851, 0.31053296, + -0.36118156, -0.0976817, -0.36916667, 0.22197971, + 0.15294972, 0.38031587, 0.27557442, 0.39635518, + -0.21580373, -0.06634006, -0.02702999, 0.27072677}); + + svdf.SetWeightsTime( + {-0.31930989, 0.37613347, 0.27901134, -0.36137494, -0.36118156, + 0.22197971, 0.27557442, -0.06634006, 0.0079667, 0.12416199, + + 0.3905206, -0.10640851, -0.0976817, 0.15294972, 0.39635518, + -0.02702999, 0.39296314, 0.15785322, 0.21931258, 0.31053296, + + -0.36916667, 0.38031587, -0.21580373, 0.27072677, 0.23622236, + 0.34936687, 0.18174365, 0.35907319, -0.17493086, 0.324846, + + -0.10781813, 0.27201805, 0.14324132, -0.23681851, -0.27115166, + -0.01580888, -0.14943552, 0.15465137, 0.09784451, -0.0337657, + + -0.14884081, 0.19931212, -0.36002168, 0.34663299, -0.11405486, + 0.12672701, 0.39463779, -0.07886535, -0.06384811, 0.08249187, + + -0.26816407, -0.19905911, 0.29211238, 0.31264046, -0.28664589, + 0.05698794, 0.11613581, 0.14078894, 0.02187902, -0.21781836, + + -0.15567942, 0.08693647, -0.38256618, 0.36580828, -0.22922277, + -0.0226903, 0.12878349, -0.28122205, -0.10850525, -0.11955214, + + 0.27179423, -0.04710215, 0.31069002, 0.22672787, 0.09580326, + 0.08682203, 0.1258215, 0.1851041, 0.29228821, 0.12366763}); + + svdf.ResetState(); + svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input)); +} + +class LSTMOpModel : public SingleOpModelWithNNAPI { + public: + LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg, + bool use_peephole, bool use_projection_weights, + bool use_projection_bias, float cell_clip, float proj_clip, + const std::vector>& input_shapes, + const TensorType& weight_type = TensorType_FLOAT32) + : n_batch_(n_batch), + n_input_(n_input), + n_cell_(n_cell), + n_output_(n_output) { + input_ = AddInput(TensorType_FLOAT32); + + if (use_cifg) { + input_to_input_weights_ = AddNullInput(); + } else { + input_to_input_weights_ = AddInput(weight_type); + } + + input_to_forget_weights_ = AddInput(weight_type); + input_to_cell_weights_ = AddInput(weight_type); + input_to_output_weights_ = AddInput(weight_type); + + if (use_cifg) { + recurrent_to_input_weights_ = AddNullInput(); + } else { + recurrent_to_input_weights_ = AddInput(weight_type); + } + + recurrent_to_forget_weights_ = AddInput(weight_type); + recurrent_to_cell_weights_ = AddInput(weight_type); + recurrent_to_output_weights_ = AddInput(weight_type); + + if (use_peephole) { + if (use_cifg) { + cell_to_input_weights_ = AddNullInput(); + } else { + cell_to_input_weights_ = AddInput(weight_type); + } + cell_to_forget_weights_ = AddInput(weight_type); + cell_to_output_weights_ = AddInput(weight_type); + } else { + cell_to_input_weights_ = AddNullInput(); + cell_to_forget_weights_ = AddNullInput(); + cell_to_output_weights_ = AddNullInput(); + } + + if (use_cifg) { + input_gate_bias_ = AddNullInput(); + } else { + input_gate_bias_ = AddInput(TensorType_FLOAT32); + } + forget_gate_bias_ = AddInput(TensorType_FLOAT32); + cell_bias_ = AddInput(TensorType_FLOAT32); + output_gate_bias_ = AddInput(TensorType_FLOAT32); + + if (use_projection_weights) { + projection_weights_ = AddInput(weight_type); + if (use_projection_bias) { + projection_bias_ = AddInput(TensorType_FLOAT32); + } else { + projection_bias_ = AddNullInput(); + } + } else { + projection_weights_ = AddNullInput(); + projection_bias_ = AddNullInput(); + } + + output_state_ = AddOutput(TensorType_FLOAT32); + cell_state_ = AddOutput(TensorType_FLOAT32); + output_ = AddOutput(TensorType_FLOAT32); + + SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, + CreateLSTMOptions(builder_, ActivationFunctionType_TANH, + cell_clip, proj_clip) + .Union()); + BuildInterpreter(input_shapes); + } + + void SetInputToInputWeights(std::initializer_list f) { + PopulateTensor(input_to_input_weights_, f); + } + + void SetInputToForgetWeights(std::initializer_list f) { + PopulateTensor(input_to_forget_weights_, f); + } + + void SetInputToCellWeights(std::initializer_list f) { + PopulateTensor(input_to_cell_weights_, f); + } + + void SetInputToOutputWeights(std::initializer_list f) { + PopulateTensor(input_to_output_weights_, f); + } + + void SetRecurrentToInputWeights(std::initializer_list f) { + PopulateTensor(recurrent_to_input_weights_, f); + } + + void SetRecurrentToForgetWeights(std::initializer_list f) { + PopulateTensor(recurrent_to_forget_weights_, f); + } + + void SetRecurrentToCellWeights(std::initializer_list f) { + PopulateTensor(recurrent_to_cell_weights_, f); + } + + void SetRecurrentToOutputWeights(std::initializer_list f) { + PopulateTensor(recurrent_to_output_weights_, f); + } + + void SetCellToInputWeights(std::initializer_list f) { + PopulateTensor(cell_to_input_weights_, f); + } + + void SetCellToForgetWeights(std::initializer_list f) { + PopulateTensor(cell_to_forget_weights_, f); + } + + void SetCellToOutputWeights(std::initializer_list f) { + PopulateTensor(cell_to_output_weights_, f); + } + + void SetInputGateBias(std::initializer_list f) { + PopulateTensor(input_gate_bias_, f); + } + + void SetForgetGateBias(std::initializer_list f) { + PopulateTensor(forget_gate_bias_, f); + } + + void SetCellBias(std::initializer_list f) { + PopulateTensor(cell_bias_, f); + } + + void SetOutputGateBias(std::initializer_list f) { + PopulateTensor(output_gate_bias_, f); + } + + void SetProjectionWeights(std::initializer_list f) { + PopulateTensor(projection_weights_, f); + } + + void SetProjectionBias(std::initializer_list f) { + PopulateTensor(projection_bias_, f); + } + + void ResetOutputState() { + const int zero_buffer_size = n_cell_ * n_batch_; + std::unique_ptr zero_buffer(new float[zero_buffer_size]); + memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); + PopulateTensor(output_state_, 0, zero_buffer.get(), + zero_buffer.get() + zero_buffer_size); + } + + void ResetCellState() { + const int zero_buffer_size = n_cell_ * n_batch_; + std::unique_ptr zero_buffer(new float[zero_buffer_size]); + memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float)); + PopulateTensor(cell_state_, 0, zero_buffer.get(), + zero_buffer.get() + zero_buffer_size); + } + + void SetInput(int offset, const float* begin, const float* end) { + PopulateTensor(input_, offset, const_cast(begin), + const_cast(end)); + } + + std::vector GetOutput() { return ExtractVector(output_); } + + int num_inputs() { return n_input_; } + int num_outputs() { return n_output_; } + int num_cells() { return n_cell_; } + int num_batches() { return n_batch_; } + + protected: + int input_; + int input_to_input_weights_; + int input_to_forget_weights_; + int input_to_cell_weights_; + int input_to_output_weights_; + + int recurrent_to_input_weights_; + int recurrent_to_forget_weights_; + int recurrent_to_cell_weights_; + int recurrent_to_output_weights_; + + int cell_to_input_weights_; + int cell_to_forget_weights_; + int cell_to_output_weights_; + + int input_gate_bias_; + int forget_gate_bias_; + int cell_bias_; + int output_gate_bias_; + + int projection_weights_; + int projection_bias_; + int input_activation_state_; + int input_cell_state_; + + int output_; + int output_state_; + int cell_state_; + + int n_batch_; + int n_input_; + int n_cell_; + int n_output_; +}; + +class BaseLstmTest : public ::testing::Test { + protected: + // Weights of the LSTM model. Some are optional. + std::initializer_list input_to_input_weights_; + std::initializer_list input_to_cell_weights_; + std::initializer_list input_to_forget_weights_; + std::initializer_list input_to_output_weights_; + std::initializer_list input_gate_bias_; + std::initializer_list cell_gate_bias_; + std::initializer_list forget_gate_bias_; + std::initializer_list output_gate_bias_; + std::initializer_list recurrent_to_input_weights_; + std::initializer_list recurrent_to_cell_weights_; + std::initializer_list recurrent_to_forget_weights_; + std::initializer_list recurrent_to_output_weights_; + std::initializer_list cell_to_input_weights_; + std::initializer_list cell_to_forget_weights_; + std::initializer_list cell_to_output_weights_; + std::initializer_list projection_weights_; + + // LSTM input is stored as num_batch x num_inputs vector. + std::vector> lstm_input_; + // LSTM output is stored as num_batch x num_outputs vector. + std::vector> lstm_golden_output_; + + // Compares output up to tolerance to the result of the lstm given the input. + void VerifyGoldens(const std::vector>& input, + const std::vector>& output, + LSTMOpModel* lstm, float tolerance = 1e-5) { + const int num_batches = input.size(); + EXPECT_GT(num_batches, 0); + const int num_inputs = lstm->num_inputs(); + EXPECT_GT(num_inputs, 0); + const int input_sequence_size = input[0].size() / num_inputs; + EXPECT_GT(input_sequence_size, 0); + for (int i = 0; i < input_sequence_size; ++i) { + for (int b = 0; b < num_batches; ++b) { + const float* batch_start = input[b].data() + i * num_inputs; + const float* batch_end = batch_start + num_inputs; + + lstm->SetInput(b * lstm->num_inputs(), batch_start, batch_end); + } + + lstm->Invoke(); + + const int num_outputs = lstm->num_outputs(); + std::vector expected; + for (int b = 0; b < num_batches; ++b) { + const float* golden_start_batch = output[b].data() + i * num_outputs; + const float* golden_end_batch = golden_start_batch + num_outputs; + expected.insert(expected.end(), golden_start_batch, golden_end_batch); + } + EXPECT_THAT(lstm->GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } + } +}; + +class NoCifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_input_weights_ = {-0.45018822, -0.02338299, -0.0870589, + -0.34550029, 0.04266912, -0.15680569, + -0.34856534, 0.43890524}; + input_to_cell_weights_ = {-0.50013041, 0.1370284, 0.11810488, 0.2013163, + -0.20583314, 0.44344562, 0.22077113, -0.29909778}; + input_to_forget_weights_ = {0.09701663, 0.20334584, -0.50592935, + -0.31343272, -0.40032279, 0.44781327, + 0.01387155, -0.35593212}; + input_to_output_weights_ = {-0.25065863, -0.28290087, 0.04613829, + 0.40525138, 0.44272184, 0.03897077, + -0.1556896, 0.19487578}; + input_gate_bias_ = {0., 0., 0., 0.}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_input_weights_ = { + -0.0063535, -0.2042388, 0.31454784, -0.35746509, + 0.28902304, 0.08183324, -0.16555229, 0.02286911, + -0.13566875, 0.03034258, 0.48091322, -0.12528998, + 0.24077177, -0.51332325, -0.33502164, 0.10629296}; + + recurrent_to_cell_weights_ = { + -0.3407414, 0.24443203, -0.2078532, 0.26320225, + 0.05695659, -0.00123841, -0.4744786, -0.35869038, + -0.06418842, -0.13502428, -0.501764, 0.22830659, + -0.46367589, 0.26016325, -0.03894562, -0.16368064}; + + recurrent_to_forget_weights_ = { + -0.48684245, -0.06655136, 0.42224967, 0.2112639, + 0.27654213, 0.20864892, -0.07646349, 0.45877004, + 0.00141793, -0.14609534, 0.36447752, 0.09196436, + 0.28053468, 0.01560611, -0.20127171, -0.01140004}; + + recurrent_to_output_weights_ = { + 0.43385774, -0.17194885, 0.2718237, 0.09215671, + 0.24107647, -0.39835793, 0.18212086, 0.01301402, + 0.48572797, -0.50656658, 0.20047462, -0.20607421, + -0.51818722, -0.15390486, 0.0468148, 0.39922136}; + + lstm_input_ = {{2., 3., 3., 4., 1., 1.}}; + lstm_golden_output_ = {{-0.02973187, 0.1229473, 0.20885126, -0.15358765, + -0.03716109, 0.12507336, 0.41193449, -0.20860538, + -0.15053082, 0.09120187, 0.24278517, -0.12222792}}; + } +}; + +TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + + LSTMOpModel lstm(n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight_tensor + {n_cell, n_output}, // recurrent_to_forget_weight_tensor + {n_cell, n_output}, // recurrent_to_cell_weight_tensor + {n_cell, n_output}, // recurrent_to_output_weight_tensor + + {0}, // cell_to_input_weight tensor + {0}, // cell_to_forget_weight tensor + {0}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + // Resetting cell_state and output_state + lstm.ResetCellState(); + lstm.ResetOutputState(); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} + +class CifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_cell_weights_ = {-0.49770179, -0.27711356, -0.09624726, + 0.05100781, 0.04717243, 0.48944736, + -0.38535351, -0.17212132}; + + input_to_forget_weights_ = {-0.55291498, -0.42866567, 0.13056988, + -0.3633365, -0.22755712, 0.28253698, + 0.24407166, 0.33826375}; + + input_to_output_weights_ = {0.10725588, -0.02335852, -0.55932593, + -0.09426838, -0.44257352, 0.54939759, + 0.01533556, 0.42751634}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_cell_weights_ = { + 0.54066205, -0.32668582, -0.43562764, -0.56094903, + 0.42957711, 0.01841056, -0.32764608, -0.33027974, + -0.10826075, 0.20675004, 0.19069612, -0.03026325, + -0.54532051, 0.33003211, 0.44901288, 0.21193194}; + + recurrent_to_forget_weights_ = { + -0.13832897, -0.0515101, -0.2359007, -0.16661474, + -0.14340827, 0.36986142, 0.23414481, 0.55899, + 0.10798943, -0.41174671, 0.17751795, -0.34484994, + -0.35874045, -0.11352962, 0.27268326, 0.54058349}; + + recurrent_to_output_weights_ = { + 0.41613156, 0.42610586, -0.16495961, -0.5663873, + 0.30579174, -0.05115908, -0.33941799, 0.23364776, + 0.11178309, 0.09481031, -0.26424935, 0.46261835, + 0.50248802, 0.26114327, -0.43736315, 0.33149987}; + + cell_to_forget_weights_ = {0.47485286, -0.51955009, -0.24458408, + 0.31544167}; + cell_to_output_weights_ = {-0.17135078, 0.82760304, 0.85573703, + -0.77109635}; + + lstm_input_ = {{2., 3., 3., 4., 1., 1.}}; + lstm_golden_output_ = {{-0.36444446, -0.00352185, 0.12886585, -0.05163646, + -0.42312205, -0.01218222, 0.24201041, -0.08124574, + -0.358325, -0.04621704, 0.21641694, -0.06471302}}; + } +}; + +TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + + LSTMOpModel lstm(n_batch, n_input, n_cell, n_output, + /*use_cifg=*/true, /*use_peephole=*/true, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {n_batch, n_input}, // input tensor + + {0, 0}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {0, 0}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {0}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {0}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {0, 0}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + // Resetting cell_state and output_state + lstm.ResetCellState(); + lstm.ResetOutputState(); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} + +class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest { + void SetUp() override { + input_to_input_weights_ = { + 0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463, + 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048, + -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385, + -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282, + -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627, + -0.06453243, 0.05031825, -0.046935108, -0.008164439, 0.014574226, + -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059, + 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698, + 0.11295408, -0.0035217577, 0.054485075, 0.05184695, 0.064711206, + 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585, + -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063, + 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603, + -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682, + -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988, + -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764, + 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476, + -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012, + -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604, + -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654, + -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677}; + + input_to_forget_weights_ = { + -0.0018401089, -0.004852237, 0.03698424, 0.014181704, + 0.028273236, -0.016726194, -0.05249759, -0.10204261, + 0.00861066, -0.040979505, -0.009899187, 0.01923892, + -0.028177269, -0.08535103, -0.14585495, 0.10662567, + -0.01909731, -0.017883534, -0.0047269356, -0.045103323, + 0.0030784295, 0.076784775, 0.07463696, 0.094531395, + 0.0814421, -0.12257899, -0.033945758, -0.031303465, + 0.045630626, 0.06843887, -0.13492945, -0.012480007, + -0.0811829, -0.07224499, -0.09628791, 0.045100946, + 0.0012300825, 0.013964662, 0.099372394, 0.02543059, + 0.06958324, 0.034257296, 0.0482646, 0.06267997, + 0.052625068, 0.12784666, 0.07077897, 0.025725935, + 0.04165009, 0.07241905, 0.018668644, -0.037377294, + -0.06277783, -0.08833636, -0.040120605, -0.011405586, + -0.007808335, -0.010301386, -0.005102167, 0.027717464, + 0.05483423, 0.11449111, 0.11289652, 0.10939839, + 0.13396506, -0.08402166, -0.01901462, -0.044678304, + -0.07720565, 0.014350063, -0.11757958, -0.0652038, + -0.08185733, -0.076754324, -0.092614375, 0.10405491, + 0.052960336, 0.035755895, 0.035839386, -0.012540553, + 0.036881298, 0.02913376, 0.03420159, 0.05448447, + -0.054523353, 0.02582715, 0.02327355, -0.011857179, + -0.0011980024, -0.034641717, -0.026125094, -0.17582615, + -0.15923657, -0.27486774, -0.0006143371, 0.0001771948, + -8.470171e-05, 0.02651807, 0.045790765, 0.06956496}; + + input_to_cell_weights_ = { + -0.04580283, -0.09549462, -0.032418985, -0.06454633, + -0.043528453, 0.043018587, -0.049152344, -0.12418144, + -0.078985475, -0.07596889, 0.019484362, -0.11434962, + -0.0074034138, -0.06314844, -0.092981495, 0.0062155537, + -0.025034338, -0.0028890965, 0.048929527, 0.06235075, + 0.10665918, -0.032036792, -0.08505916, -0.10843358, + -0.13002433, -0.036816437, -0.02130134, -0.016518239, + 0.0047691227, -0.0025825808, 0.066017866, 0.029991534, + -0.10652836, -0.1037554, -0.13056071, -0.03266643, + -0.033702414, -0.006473424, -0.04611692, 0.014419339, + -0.025174323, 0.0396852, 0.081777506, 0.06157468, + 0.10210095, -0.009658194, 0.046511717, 0.03603906, + 0.0069369148, 0.015960095, -0.06507666, 0.09551598, + 0.053568836, 0.06408714, 0.12835667, -0.008714329, + -0.20211966, -0.12093674, 0.029450472, 0.2849013, + -0.029227901, 0.1164364, -0.08560263, 0.09941786, + -0.036999565, -0.028842626, -0.0033637602, -0.017012902, + -0.09720865, -0.11193351, -0.029155117, -0.017936034, + -0.009768936, -0.04223324, -0.036159635, 0.06505112, + -0.021742892, -0.023377212, -0.07221364, -0.06430552, + 0.05453865, 0.091149814, 0.06387331, 0.007518393, + 0.055960953, 0.069779344, 0.046411168, 0.10509911, + 0.07463894, 0.0075130584, 0.012850982, 0.04555431, + 0.056955688, 0.06555285, 0.050801456, -0.009862683, + 0.00826772, -0.026555609, -0.0073611983, -0.0014897042}; + + input_to_output_weights_ = { + -0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918, + -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534, + 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722, + -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761, + -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394, + 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154, + -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135, + -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564, + -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047, + -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304, + 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946, + 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646, + 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813, + -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403, + 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415, + 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495, + -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158, + 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295, + -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739, + -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956}; + + input_gate_bias_ = {0.02234832, 0.14757581, 0.18176508, 0.10380666, + 0.053110216, -0.06928846, -0.13942584, -0.11816189, + 0.19483899, 0.03652339, -0.10250295, 0.036714908, + -0.18426876, 0.036065217, 0.21810818, 0.02383196, + -0.043370757, 0.08690144, -0.04444982, 0.00030581196}; + + forget_gate_bias_ = {0.035185695, -0.042891346, -0.03032477, 0.23027696, + 0.11098921, 0.15378423, 0.09263801, 0.09790885, + 0.09508917, 0.061199076, 0.07665568, -0.015443159, + -0.03499149, 0.046190713, 0.08895977, 0.10899629, + 0.40694186, 0.06030037, 0.012413437, -0.06108739}; + + cell_gate_bias_ = {-0.024379363, 0.0055531194, 0.23377132, 0.033463873, + -0.1483596, -0.10639995, -0.091433935, 0.058573797, + -0.06809782, -0.07889636, -0.043246906, -0.09829136, + -0.4279842, 0.034901652, 0.18797937, 0.0075234566, + 0.016178843, 0.1749513, 0.13975595, 0.92058027}; + + output_gate_bias_ = {0.046159424, -0.0012809046, 0.03563469, 0.12648113, + 0.027195795, 0.35373217, -0.018957434, 0.008907322, + -0.0762701, 0.12018895, 0.04216877, 0.0022856654, + 0.040952638, 0.3147856, 0.08225149, -0.057416286, + -0.14995944, -0.008040261, 0.13208859, 0.029760877}; + + recurrent_to_input_weights_ = { + -0.001374326, -0.078856036, 0.10672688, 0.029162422, + -0.11585556, 0.02557986, -0.13446963, -0.035785314, + -0.01244275, 0.025961924, -0.02337298, -0.044228926, + -0.055839065, -0.046598054, -0.010546039, -0.06900766, + 0.027239809, 0.022582639, -0.013296484, -0.05459212, + 0.08981, -0.045407712, 0.08682226, -0.06867011, + -0.14390695, -0.02916037, 0.000996957, 0.091420636, + 0.14283475, -0.07390571, -0.06402044, 0.062524505, + -0.093129106, 0.04860203, -0.08364217, -0.08119002, + 0.009352075, 0.22920375, 0.0016303885, 0.11583097, + -0.13732095, 0.012405723, -0.07551853, 0.06343048, + 0.12162708, -0.031923793, -0.014335606, 0.01790974, + -0.10650317, -0.0724401, 0.08554849, -0.05727212, + 0.06556731, -0.042729504, -0.043227166, 0.011683251, + -0.013082158, -0.029302018, -0.010899579, -0.062036745, + -0.022509435, -0.00964907, -0.01567329, 0.04260106, + -0.07787477, -0.11576462, 0.017356863, 0.048673786, + -0.017577527, -0.05527947, -0.082487635, -0.040137455, + -0.10820036, -0.04666372, 0.022746278, -0.07851417, + 0.01068115, 0.032956902, 0.022433773, 0.0026891115, + 0.08944216, -0.0685835, 0.010513544, 0.07228705, + 0.02032331, -0.059686817, -0.0005566496, -0.086984694, + 0.040414046, -0.1380399, 0.094208956, -0.05722982, + 0.012092817, -0.04989123, -0.086576, -0.003399834, + -0.04696032, -0.045747425, 0.10091314, 0.048676282, + -0.029037097, 0.031399418, -0.0040285117, 0.047237843, + 0.09504992, 0.041799378, -0.049185462, -0.031518843, + -0.10516937, 0.026374253, 0.10058866, -0.0033195973, + -0.041975245, 0.0073591834, 0.0033782164, -0.004325073, + -0.10167381, 0.042500053, -0.01447153, 0.06464186, + -0.017142897, 0.03312627, 0.009205989, 0.024138335, + -0.011337001, 0.035530265, -0.010912711, 0.0706555, + -0.005894094, 0.051841937, -0.1401738, -0.02351249, + 0.0365468, 0.07590991, 0.08838724, 0.021681072, + -0.10086113, 0.019608743, -0.06195883, 0.077335775, + 0.023646897, -0.095322326, 0.02233014, 0.09756986, + -0.048691444, -0.009579111, 0.07595467, 0.11480546, + -0.09801813, 0.019894179, 0.08502348, 0.004032281, + 0.037211012, 0.068537936, -0.048005626, -0.091520436, + -0.028379958, -0.01556313, 0.06554592, -0.045599163, + -0.01672207, -0.020169014, -0.011877351, -0.20212261, + 0.010889619, 0.0047078193, 0.038385306, 0.08540671, + -0.017140968, -0.0035865551, 0.016678626, 0.005633034, + 0.015963363, 0.00871737, 0.060130805, 0.028611384, + 0.10109069, -0.015060172, -0.07894427, 0.06401885, + 0.011584063, -0.024466386, 0.0047652307, -0.09041358, + 0.030737216, -0.0046374933, 0.14215417, -0.11823516, + 0.019899689, 0.006106124, -0.027092824, 0.0786356, + 0.05052217, -0.058925, -0.011402121, -0.024987547, + -0.0013661642, -0.06832946, -0.015667673, -0.1083353, + -0.00096863037, -0.06988685, -0.053350925, -0.027275559, + -0.033664223, -0.07978348, -0.025200296, -0.017207067, + -0.058403496, -0.055697463, 0.005798788, 0.12965427, + -0.062582195, 0.0013350133, -0.10482091, 0.0379771, + 0.072521195, -0.0029455067, -0.13797039, -0.03628521, + 0.013806405, -0.017858358, -0.01008298, -0.07700066, + -0.017081132, 0.019358726, 0.0027079724, 0.004635139, + 0.062634714, -0.02338735, -0.039547626, -0.02050681, + 0.03385117, -0.083611414, 0.002862572, -0.09421313, + 0.058618143, -0.08598433, 0.00972939, 0.023867095, + -0.053934585, -0.023203006, 0.07452513, -0.048767887, + -0.07314807, -0.056307215, -0.10433547, -0.06440842, + 0.04328182, 0.04389765, -0.020006588, -0.09076438, + -0.11652589, -0.021705797, 0.03345259, -0.010329105, + -0.025767034, 0.013057034, -0.07316461, -0.10145612, + 0.06358255, 0.18531723, 0.07759293, 0.12006465, + 0.1305557, 0.058638252, -0.03393652, 0.09622831, + -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845, + -0.005644518, 0.06857898, -0.12598175, -0.035084512, + 0.03156317, -0.12794146, -0.031963028, 0.04692781, + 0.030070418, 0.0071660685, -0.095516115, -0.004643372, + 0.040170413, -0.062104587, -0.0037324072, 0.0554317, + 0.08184801, -0.019164372, 0.06791302, 0.034257166, + -0.10307039, 0.021943003, 0.046745934, 0.0790918, + -0.0265588, -0.007824208, 0.042546265, -0.00977924, + -0.0002440307, -0.017384544, -0.017990116, 0.12252321, + -0.014512694, -0.08251313, 0.08861942, 0.13589665, + 0.026351685, 0.012641483, 0.07466548, 0.044301085, + -0.045414884, -0.051112458, 0.03444247, -0.08502782, + -0.04106223, -0.028126027, 0.028473156, 0.10467447}; + + recurrent_to_cell_weights_ = { + -0.037322544, 0.018592842, 0.0056175636, -0.06253426, + 0.055647098, -0.05713207, -0.05626563, 0.005559383, + 0.03375411, -0.025757805, -0.088049285, 0.06017052, + -0.06570978, 0.007384076, 0.035123326, -0.07920549, + 0.053676967, 0.044480428, -0.07663568, 0.0071805613, + 0.08089997, 0.05143358, 0.038261272, 0.03339287, + -0.027673481, 0.044746667, 0.028349208, 0.020090483, + -0.019443132, -0.030755889, -0.0040000007, 0.04465846, + -0.021585021, 0.0031670958, 0.0053199246, -0.056117613, + -0.10893326, 0.076739706, -0.08509834, -0.027997585, + 0.037871376, 0.01449768, -0.09002357, -0.06111149, + -0.046195522, 0.0422062, -0.005683705, -0.1253618, + -0.012925729, -0.04890792, 0.06985068, 0.037654128, + 0.03398274, -0.004781977, 0.007032333, -0.031787455, + 0.010868644, -0.031489216, 0.09525667, 0.013939797, + 0.0058680447, 0.0167067, 0.02668468, -0.04797466, + -0.048885044, -0.12722108, 0.035304096, 0.06554885, + 0.00972396, -0.039238118, -0.05159735, -0.11329045, + 0.1613692, -0.03750952, 0.06529313, -0.071974665, + -0.11769596, 0.015524369, -0.0013754242, -0.12446318, + 0.02786344, -0.014179351, 0.005264273, 0.14376344, + 0.015983658, 0.03406988, -0.06939408, 0.040699873, + 0.02111075, 0.09669095, 0.041345075, -0.08316494, + -0.07684199, -0.045768797, 0.032298047, -0.041805092, + 0.0119405, 0.0061010392, 0.12652606, 0.0064572375, + -0.024950314, 0.11574242, 0.04508852, -0.04335324, + 0.06760663, -0.027437469, 0.07216407, 0.06977076, + -0.05438599, 0.034033038, -0.028602652, 0.05346137, + 0.043184172, -0.037189785, 0.10420091, 0.00882477, + -0.054019816, -0.074273005, -0.030617684, -0.0028467078, + 0.024302477, -0.0038869337, 0.005332455, 0.0013399826, + 0.04361412, -0.007001822, 0.09631092, -0.06702025, + -0.042049985, -0.035070654, -0.04103342, -0.10273396, + 0.0544271, 0.037184782, -0.13150354, -0.0058036847, + -0.008264958, 0.042035464, 0.05891794, 0.029673764, + 0.0063542654, 0.044788733, 0.054816857, 0.062257513, + -0.00093483756, 0.048938446, -0.004952862, -0.007730018, + -0.04043371, -0.017094059, 0.07229206, -0.023670016, + -0.052195564, -0.025616996, -0.01520939, 0.045104615, + -0.007376126, 0.003533447, 0.006570588, 0.056037236, + 0.12436656, 0.051817212, 0.028532185, -0.08686856, + 0.11868599, 0.07663395, -0.07323171, 0.03463402, + -0.050708205, -0.04458982, -0.11590894, 0.021273347, + 0.1251325, -0.15313013, -0.12224372, 0.17228661, + 0.023029093, 0.086124025, 0.006445803, -0.03496501, + 0.028332196, 0.04449512, -0.042436164, -0.026587414, + -0.006041347, -0.09292539, -0.05678812, 0.03897832, + 0.09465633, 0.008115513, -0.02171956, 0.08304309, + 0.071401566, 0.019622514, 0.032163795, -0.004167056, + 0.02295182, 0.030739572, 0.056506045, 0.004612461, + 0.06524936, 0.059999723, 0.046395954, -0.0045512207, + -0.1335546, -0.030136576, 0.11584653, -0.014678886, + 0.0020118146, -0.09688814, -0.0790206, 0.039770417, + -0.0329582, 0.07922767, 0.029322514, 0.026405897, + 0.04207835, -0.07073373, 0.063781224, 0.0859677, + -0.10925287, -0.07011058, 0.048005477, 0.03438226, + -0.09606514, -0.006669445, -0.043381985, 0.04240257, + -0.06955775, -0.06769346, 0.043903265, -0.026784198, + -0.017840602, 0.024307009, -0.040079936, -0.019946516, + 0.045318738, -0.12233574, 0.026170589, 0.0074471775, + 0.15978073, 0.10185836, 0.10298046, -0.015476589, + -0.039390966, -0.072174534, 0.0739445, -0.1211869, + -0.0347889, -0.07943156, 0.014809798, -0.12412325, + -0.0030663363, 0.039695457, 0.0647603, -0.08291318, + -0.018529687, -0.004423833, 0.0037507233, 0.084633216, + -0.01514876, -0.056505352, -0.012800942, -0.06994386, + 0.012962922, -0.031234352, 0.07029052, 0.016418684, + 0.03618972, 0.055686004, -0.08663945, -0.017404709, + -0.054761406, 0.029065743, 0.052404847, 0.020238016, + 0.0048197987, -0.0214882, 0.07078733, 0.013016777, + 0.06262858, 0.009184685, 0.020785125, -0.043904778, + -0.0270329, -0.03299152, -0.060088247, -0.015162964, + -0.001828936, 0.12642565, -0.056757294, 0.013586685, + 0.09232601, -0.035886683, 0.06000002, 0.05229691, + -0.052580316, -0.082029596, -0.010794592, 0.012947712, + -0.036429964, -0.085508935, -0.13127148, -0.017744139, + 0.031502828, 0.036232427, -0.031581745, 0.023051167, + -0.05325106, -0.03421577, 0.028793324, -0.034633752, + -0.009881397, -0.043551125, -0.018609839, 0.0019097115, + -0.008799762, 0.056595087, 0.0022273948, 0.055752404}; + + recurrent_to_forget_weights_ = { + -0.057784554, -0.026057621, -0.068447545, -0.022581743, + 0.14811787, 0.10826372, 0.09471067, 0.03987225, + -0.0039523416, 0.00030638507, 0.053185795, 0.10572994, + 0.08414449, -0.022036452, -0.00066928595, -0.09203576, + 0.032950465, -0.10985798, -0.023809856, 0.0021431844, + -0.02196096, -0.00326074, 0.00058621005, -0.074678116, + -0.06193199, 0.055729095, 0.03736828, 0.020123724, + 0.061878487, -0.04729229, 0.034919553, -0.07585433, + -0.04421272, -0.044019096, 0.085488975, 0.04058006, + -0.06890133, -0.030951202, -0.024628663, -0.07672815, + 0.034293607, 0.08556707, -0.05293577, -0.033561368, + -0.04899627, 0.0241671, 0.015736353, -0.095442444, + -0.029564252, 0.016493602, -0.035026584, 0.022337519, + -0.026871363, 0.004780428, 0.0077918363, -0.03601621, + 0.016435321, -0.03263031, -0.09543275, -0.047392778, + 0.013454138, 0.028934088, 0.01685226, -0.086110644, + -0.046250615, -0.01847454, 0.047608484, 0.07339695, + 0.034546845, -0.04881143, 0.009128804, -0.08802852, + 0.03761666, 0.008096139, -0.014454086, 0.014361001, + -0.023502491, -0.0011840804, -0.07607001, 0.001856849, + -0.06509276, -0.006021153, -0.08570962, -0.1451793, + 0.060212336, 0.055259194, 0.06974018, 0.049454916, + -0.027794661, -0.08077226, -0.016179763, 0.1169753, + 0.17213494, -0.0056326236, -0.053934924, -0.0124349, + -0.11520337, 0.05409887, 0.088759385, 0.0019655675, + 0.0042065294, 0.03881498, 0.019844765, 0.041858196, + -0.05695512, 0.047233116, 0.038937137, -0.06542224, + 0.014429736, -0.09719407, 0.13908425, -0.05379757, + 0.012321099, 0.082840554, -0.029899208, 0.044217527, + 0.059855383, 0.07711018, -0.045319796, 0.0948846, + -0.011724666, -0.0033288454, -0.033542685, -0.04764985, + -0.13873616, 0.040668588, 0.034832682, -0.015319203, + -0.018715994, 0.046002675, 0.0599172, -0.043107376, + 0.0294216, -0.002314414, -0.022424703, 0.0030315618, + 0.0014641669, 0.0029166266, -0.11878115, 0.013738511, + 0.12375372, -0.0006038222, 0.029104086, 0.087442465, + 0.052958444, 0.07558703, 0.04817258, 0.044462286, + -0.015213451, -0.08783778, -0.0561384, -0.003008196, + 0.047060397, -0.002058388, 0.03429439, -0.018839769, + 0.024734668, 0.024614193, -0.042046934, 0.09597743, + -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786, + -0.02558259, -0.022822596, -0.023273505, -0.02464396, + -0.10991725, -0.006240552, 0.0074488563, 0.024044557, + 0.04383914, -0.046476185, 0.028658995, 0.060410924, + 0.050786525, 0.009452605, -0.0073054377, -0.024810238, + 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517, + 0.015898481, 0.021362653, -0.030262267, 0.016587038, + -0.011442813, 0.041154444, -0.007631438, -0.03423484, + -0.010977775, 0.036152758, 0.0066366293, 0.11915515, + 0.02318443, -0.041350313, 0.021485701, -0.10906167, + -0.028218046, -0.00954771, 0.020531068, -0.11995105, + -0.03672871, 0.024019798, 0.014255957, -0.05221243, + -0.00661567, -0.04630967, 0.033188973, 0.10107534, + -0.014027541, 0.030796422, -0.10270911, -0.035999842, + 0.15443139, 0.07684145, 0.036571592, -0.035900835, + -0.0034699554, 0.06209149, 0.015920248, -0.031122351, + -0.03858649, 0.01849943, 0.13872518, 0.01503974, + 0.069941424, -0.06948533, -0.0088794185, 0.061282158, + -0.047401894, 0.03100163, -0.041533746, -0.10430945, + 0.044574402, -0.01425562, -0.024290353, 0.034563623, + 0.05866852, 0.023947537, -0.09445152, 0.035450947, + 0.02247216, -0.0042998926, 0.061146557, -0.10250651, + 0.020881841, -0.06747029, 0.10062043, -0.0023941975, + 0.03532124, -0.016341697, 0.09685456, -0.016764693, + 0.051808182, 0.05875331, -0.04536488, 0.001626336, + -0.028892258, -0.01048663, -0.009793449, -0.017093895, + 0.010987891, 0.02357273, -0.00010856845, 0.0099760275, + -0.001845119, -0.03551521, 0.0018358806, 0.05763657, + -0.01769146, 0.040995963, 0.02235177, -0.060430344, + 0.11475477, -0.023854522, 0.10071741, 0.0686208, + -0.014250481, 0.034261297, 0.047418304, 0.08562733, + -0.030519066, 0.0060542435, 0.014653856, -0.038836084, + 0.04096551, 0.032249358, -0.08355519, -0.026823482, + 0.056386515, -0.010401743, -0.028396193, 0.08507674, + 0.014410365, 0.020995233, 0.17040324, 0.11511526, + 0.02459721, 0.0066619175, 0.025853224, -0.023133837, + -0.081302024, 0.017264642, -0.009585969, 0.09491168, + -0.051313367, 0.054532815, -0.014298593, 0.10657464, + 0.007076659, 0.10964551, 0.0409152, 0.008275321, + -0.07283536, 0.07937492, 0.04192024, -0.1075027}; + + recurrent_to_output_weights_ = { + 0.025825322, -0.05813119, 0.09495884, -0.045984812, + -0.01255415, -0.0026479573, -0.08196161, -0.054914974, + -0.0046604523, -0.029587349, -0.044576716, -0.07480124, + -0.082868785, 0.023254942, 0.027502948, -0.0039728214, + -0.08683098, -0.08116779, -0.014675607, -0.037924774, + -0.023314456, -0.007401714, -0.09255757, 0.029460307, + -0.08829125, -0.005139627, -0.08989442, -0.0555066, + 0.13596267, -0.025062224, -0.048351806, -0.03850004, + 0.07266485, -0.022414139, 0.05940088, 0.075114764, + 0.09597592, -0.010211725, -0.0049794707, -0.011523867, + -0.025980417, 0.072999895, 0.11091378, -0.081685916, + 0.014416728, 0.043229222, 0.034178585, -0.07530371, + 0.035837382, -0.085607, -0.007721233, -0.03287832, + -0.043848954, -0.06404588, -0.06632928, -0.073643476, + 0.008214239, -0.045984086, 0.039764922, 0.03474462, + 0.060612556, -0.080590084, 0.049127717, 0.04151091, + -0.030063879, 0.008801774, -0.023021035, -0.019558564, + 0.05158114, -0.010947698, -0.011825728, 0.0075720972, + 0.0699727, -0.0039981045, 0.069350146, 0.08799282, + 0.016156472, 0.035502106, 0.11695009, 0.006217345, + 0.13392477, -0.037875112, 0.025745004, 0.08940699, + -0.00924166, 0.0046702605, -0.036598757, -0.08811812, + 0.10522024, -0.032441203, 0.008176899, -0.04454919, + 0.07058152, 0.0067963637, 0.039206743, 0.03259838, + 0.03725492, -0.09515802, 0.013326398, -0.052055415, + -0.025676316, 0.03198509, -0.015951829, -0.058556724, + 0.036879618, 0.043357447, 0.028362012, -0.05908629, + 0.0059240665, -0.04995891, -0.019187413, 0.0276265, + -0.01628143, 0.0025863599, 0.08800015, 0.035250366, + -0.022165963, -0.07328642, -0.009415526, -0.07455109, + 0.11690406, 0.0363299, 0.07411125, 0.042103454, + -0.009660886, 0.019076364, 0.018299393, -0.046004917, + 0.08891175, 0.0431396, -0.026327137, -0.051502608, + 0.08979574, -0.051670972, 0.04940282, -0.07491107, + -0.021240504, 0.022596184, -0.034280192, 0.060163025, + -0.058211457, -0.051837247, -0.01349775, -0.04639988, + -0.035936575, -0.011681591, 0.064818054, 0.0073146066, + -0.021745546, -0.043124277, -0.06471268, -0.07053354, + -0.029321948, -0.05330136, 0.016933719, -0.053782392, + 0.13747959, -0.1361751, -0.11569455, 0.0033329215, + 0.05693899, -0.053219706, 0.063698, 0.07977434, + -0.07924483, 0.06936997, 0.0034815092, -0.007305279, + -0.037325785, -0.07251102, -0.033633437, -0.08677009, + 0.091591336, -0.14165086, 0.021752775, 0.019683983, + 0.0011612234, -0.058154266, 0.049996935, 0.0288841, + -0.0024567875, -0.14345716, 0.010955264, -0.10234828, + 0.1183656, -0.0010731248, -0.023590032, -0.072285876, + -0.0724771, -0.026382286, -0.0014920527, 0.042667855, + 0.0018776858, 0.02986552, 0.009814309, 0.0733756, + 0.12289186, 0.018043943, -0.0458958, 0.049412545, + 0.033632483, 0.05495232, 0.036686596, -0.013781798, + -0.010036754, 0.02576849, -0.08307328, 0.010112348, + 0.042521734, -0.05869831, -0.071689695, 0.03876447, + -0.13275425, -0.0352966, -0.023077697, 0.10285965, + 0.084736146, 0.15568255, -0.00040734606, 0.027835453, + -0.10292561, -0.032401145, 0.10053256, -0.026142767, + -0.08271222, -0.0030240538, -0.016368777, 0.1070414, + 0.042672627, 0.013456989, -0.0437609, -0.022309763, + 0.11576483, 0.04108048, 0.061026827, -0.0190714, + -0.0869359, 0.037901703, 0.0610107, 0.07202949, + 0.01675338, 0.086139716, -0.08795751, -0.014898893, + -0.023771819, -0.01965048, 0.007955471, -0.043740474, + 0.03346837, -0.10549954, 0.090567775, 0.042013682, + -0.03176985, 0.12569028, -0.02421228, -0.029526481, + 0.023851605, 0.031539805, 0.05292009, -0.02344001, + -0.07811758, -0.08834428, 0.10094801, 0.16594367, + -0.06861939, -0.021256343, -0.041093912, -0.06669611, + 0.035498552, 0.021757556, -0.09302526, -0.015403468, + -0.06614931, -0.051798206, -0.013874718, 0.03630673, + 0.010412845, -0.08077351, 0.046185967, 0.0035662893, + 0.03541868, -0.094149634, -0.034814864, 0.003128424, + -0.020674974, -0.03944324, -0.008110165, -0.11113267, + 0.08484226, 0.043586485, 0.040582247, 0.0968012, + -0.065249965, -0.028036479, 0.0050708856, 0.0017462453, + 0.0326779, 0.041296225, 0.09164146, -0.047743853, + -0.015952192, -0.034451712, 0.084197424, -0.05347844, + -0.11768019, 0.085926116, -0.08251791, -0.045081906, + 0.0948852, 0.068401024, 0.024856757, 0.06978981, + -0.057309967, -0.012775832, -0.0032452994, 0.01977615, + -0.041040014, -0.024264973, 0.063464895, 0.05431621, + }; + + cell_to_input_weights_ = { + 0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458, + -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174, + -0.10116027, -0.011334949, 0.12411352, -0.076769054, -0.052169047, + 0.21198851, -0.38871562, -0.09061183, -0.09683246, -0.21929175}; + + cell_to_forget_weights_ = { + -0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276, + -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766, + -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774, + 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355}; + + cell_to_output_weights_ = { + 0.08286371, -0.08261836, -0.51210177, 0.002913762, 0.17764764, + -0.5495371, -0.08460716, -0.24552552, 0.030037103, 0.04123544, + -0.11940523, 0.007358328, 0.1890978, 0.4833202, -0.34441817, + 0.36312827, -0.26375428, 0.1457655, -0.19724406, 0.15548733}; + + projection_weights_ = { + -0.009802181, 0.09401916, 0.0717386, -0.13895074, + 0.09641832, 0.060420845, 0.08539281, 0.054285463, + 0.061395317, 0.034448683, -0.042991187, 0.019801661, + -0.16840284, -0.015726732, -0.23041931, -0.024478018, + -0.10959692, -0.013875541, 0.18600968, -0.061274476, + 0.0138165, -0.08160894, -0.07661644, 0.032372914, + 0.16169067, 0.22465782, -0.03993472, -0.004017731, + 0.08633481, -0.28869787, 0.08682067, 0.17240396, + 0.014975425, 0.056431185, 0.031037588, 0.16702051, + 0.0077946745, 0.15140012, 0.29405436, 0.120285, + -0.188994, -0.027265169, 0.043389652, -0.022061434, + 0.014777949, -0.20203483, 0.094781205, 0.19100232, + 0.13987629, -0.036132768, -0.06426278, -0.05108664, + 0.13221376, 0.009441198, -0.16715929, 0.15859416, + -0.040437475, 0.050779544, -0.022187516, 0.012166504, + 0.027685808, -0.07675938, -0.0055694645, -0.09444123, + 0.0046453946, 0.050794356, 0.10770313, -0.20790008, + -0.07149004, -0.11425117, 0.008225835, -0.035802525, + 0.14374903, 0.15262283, 0.048710253, 0.1847461, + -0.007487823, 0.11000021, -0.09542012, 0.22619456, + -0.029149994, 0.08527916, 0.009043713, 0.0042746216, + 0.016261552, 0.022461696, 0.12689082, -0.043589946, + -0.12035478, -0.08361797, -0.050666027, -0.1248618, + -0.1275799, -0.071875185, 0.07377272, 0.09944291, + -0.18897448, -0.1593054, -0.06526116, -0.040107165, + -0.004618631, -0.067624845, -0.007576253, 0.10727444, + 0.041546922, -0.20424393, 0.06907816, 0.050412357, + 0.00724631, 0.039827548, 0.12449835, 0.10747581, + 0.13708383, 0.09134148, -0.12617786, -0.06428341, + 0.09956831, 0.1208086, -0.14676677, -0.0727722, + 0.1126304, 0.010139365, 0.015571211, -0.038128063, + 0.022913318, -0.042050496, 0.16842307, -0.060597885, + 0.10531834, -0.06411776, -0.07451711, -0.03410368, + -0.13393489, 0.06534304, 0.003620307, 0.04490757, + 0.05970546, 0.05197996, 0.02839995, 0.10434969, + -0.013699693, -0.028353551, -0.07260381, 0.047201227, + -0.024575593, -0.036445823, 0.07155557, 0.009672501, + -0.02328883, 0.009533515, -0.03606021, -0.07421458, + -0.028082801, -0.2678904, -0.13221288, 0.18419984, + -0.13012612, -0.014588381, -0.035059117, -0.04824723, + 0.07830115, -0.056184657, 0.03277091, 0.025466874, + 0.14494097, -0.12522776, -0.098633975, -0.10766018, + -0.08317623, 0.08594209, 0.07749552, 0.039474737, + 0.1776665, -0.07409566, -0.0477268, 0.29323658, + 0.10801441, 0.1154011, 0.013952499, 0.10739139, + 0.10708251, -0.051456142, 0.0074137426, -0.10430189, + 0.10034707, 0.045594677, 0.0635285, -0.0715442, + -0.089667566, -0.10811871, 0.00026344223, 0.08298446, + -0.009525053, 0.006585689, -0.24567553, -0.09450807, + 0.09648481, 0.026996298, -0.06419476, -0.04752702, + -0.11063944, -0.23441927, -0.17608605, -0.052156363, + 0.067035615, 0.19271925, -0.0032889997, -0.043264326, + 0.09663576, -0.057112187, -0.10100678, 0.0628376, + 0.04447668, 0.017961001, -0.10094388, -0.10190601, + 0.18335468, 0.10494553, -0.052095775, -0.0026118709, + 0.10539724, -0.04383912, -0.042349473, 0.08438151, + -0.1947263, 0.02251204, 0.11216432, -0.10307853, + 0.17351969, -0.039091777, 0.08066188, -0.00561982, + 0.12633002, 0.11335965, -0.0088127935, -0.019777594, + 0.06864014, -0.059751723, 0.016233567, -0.06894641, + -0.28651384, -0.004228674, 0.019708522, -0.16305895, + -0.07468996, -0.0855457, 0.099339016, -0.07580735, + -0.13775392, 0.08434318, 0.08330512, -0.12131499, + 0.031935584, 0.09180414, -0.08876437, -0.08049874, + 0.008753825, 0.03498998, 0.030215185, 0.03907079, + 0.089751154, 0.029194152, -0.03337423, -0.019092513, + 0.04331237, 0.04299654, -0.036394123, -0.12915532, + 0.09793732, 0.07512415, -0.11319543, -0.032502122, + 0.15661901, 0.07671967, -0.005491124, -0.19379048, + -0.218606, 0.21448623, 0.017840758, 0.1416943, + -0.07051762, 0.19488361, 0.02664691, -0.18104725, + -0.09334311, 0.15026465, -0.15493552, -0.057762887, + -0.11604192, -0.262013, -0.01391798, 0.012185008, + 0.11156489, -0.07483202, 0.06693364, -0.26151478, + 0.046425626, 0.036540434, -0.16435726, 0.17338543, + -0.21401681, -0.11385144, -0.08283257, -0.069031075, + 0.030635102, 0.010969227, 0.11109743, 0.010919218, + 0.027526086, 0.13519906, 0.01891392, -0.046839405, + -0.040167913, 0.017953383, -0.09700955, 0.0061885654, + -0.07000971, 0.026893595, -0.038844477, 0.14543656}; + + lstm_input_ = { + {// Batch0: 4 (input_sequence_size) * 5 (n_input) + 0.787926, 0.151646, 0.071352, 0.118426, 0.458058, // step 0 + 0.596268, 0.998386, 0.568695, 0.864524, 0.571277, // step 1 + 0.073204, 0.296072, 0.743333, 0.069199, 0.045348, // step 2 + 0.867394, 0.291279, 0.013714, 0.482521, 0.626339}, // step 3 + + {// Batch1: 4 (input_sequence_size) * 5 (n_input) + 0.295743, 0.544053, 0.690064, 0.858138, 0.497181, // step 0 + 0.642421, 0.524260, 0.134799, 0.003639, 0.162482, // step 1 + 0.640394, 0.930399, 0.050782, 0.432485, 0.988078, // step 2 + 0.082922, 0.563329, 0.865614, 0.333232, 0.259916} // step 3 + }; + + lstm_golden_output_ = { + {// Batch0: 4 (input_sequence_size) * 16 (n_output) + -0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576, + -0.0211779, 0.0283512, -0.0114597, 0.00907307, -0.0244004, + -0.0152191, -0.0259063, 0.00914318, 0.00415118, 0.017147, + 0.0134203, -0.0166936, 0.0381209, 0.000889694, 0.0143363, + -0.0328911, -0.0234288, 0.0333051, -0.012229, 0.0110322, + -0.0457725, -0.000832209, -0.0202817, 0.0327257, 0.0121308, + 0.0155969, 0.0312091, -0.0213783, 0.0350169, 0.000324794, + 0.0276012, -0.0263374, -0.0371449, 0.0446149, -0.0205474, + 0.0103729, -0.0576349, -0.0150052, -0.0292043, 0.0376827, + 0.0136115, 0.0243435, 0.0354492, -0.0189322, 0.0464512, + -0.00251373, 0.0225745, -0.0308346, -0.0317124, 0.0460407, + -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193, + 0.0286833, 0.00824207, 0.0264887, 0.0305169}, + {// Batch1: 4 (input_sequence_size) * 16 (n_output) + -0.013869, 0.0287268, -0.00334693, 0.00733398, -0.0287926, + -0.0186926, 0.0193662, -0.0115437, 0.00422612, -0.0345232, + 0.00223253, -0.00957321, 0.0210624, 0.013331, 0.0150954, + 0.02168, -0.0141913, 0.0322082, 0.00227024, 0.0260507, + -0.0188721, -0.0296489, 0.0399134, -0.0160509, 0.0116039, + -0.0447318, -0.0150515, -0.0277406, 0.0316596, 0.0118233, + 0.0214762, 0.0293641, -0.0204549, 0.0450315, -0.00117378, + 0.0167673, -0.0375007, -0.0238314, 0.038784, -0.0174034, + 0.0131743, -0.0506589, -0.0048447, -0.0240239, 0.0325789, + 0.00790065, 0.0220157, 0.0333314, -0.0264787, 0.0387855, + -0.000764675, 0.0217599, -0.037537, -0.0335206, 0.0431679, + -0.0211424, 0.010203, -0.062785, -0.00832363, -0.025181, + 0.0412031, 0.0118723, 0.0239643, 0.0394009}}; + } +}; + +TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) { + const int n_batch = 2; + const int n_input = 5; + const int n_cell = 20; + const int n_output = 16; + + LSTMOpModel lstm(n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, + /*cell_clip=*/0.0, /*proj_clip=*/0.0, + { + {n_batch, n_input}, // input tensor + + {n_cell, n_input}, // input_to_input_weight tensor + {n_cell, n_input}, // input_to_forget_weight tensor + {n_cell, n_input}, // input_to_cell_weight tensor + {n_cell, n_input}, // input_to_output_weight tensor + + {n_cell, n_output}, // recurrent_to_input_weight tensor + {n_cell, n_output}, // recurrent_to_forget_weight tensor + {n_cell, n_output}, // recurrent_to_cell_weight tensor + {n_cell, n_output}, // recurrent_to_output_weight tensor + + {n_cell}, // cell_to_input_weight tensor + {n_cell}, // cell_to_forget_weight tensor + {n_cell}, // cell_to_output_weight tensor + + {n_cell}, // input_gate_bias tensor + {n_cell}, // forget_gate_bias tensor + {n_cell}, // cell_bias tensor + {n_cell}, // output_gate_bias tensor + + {n_output, n_cell}, // projection_weight tensor + {0}, // projection_bias tensor + }); + + lstm.SetInputToInputWeights(input_to_input_weights_); + lstm.SetInputToCellWeights(input_to_cell_weights_); + lstm.SetInputToForgetWeights(input_to_forget_weights_); + lstm.SetInputToOutputWeights(input_to_output_weights_); + + lstm.SetInputGateBias(input_gate_bias_); + lstm.SetCellBias(cell_gate_bias_); + lstm.SetForgetGateBias(forget_gate_bias_); + lstm.SetOutputGateBias(output_gate_bias_); + + lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_); + lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_); + lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_); + lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_); + + lstm.SetCellToInputWeights(cell_to_input_weights_); + lstm.SetCellToForgetWeights(cell_to_forget_weights_); + lstm.SetCellToOutputWeights(cell_to_output_weights_); + + lstm.SetProjectionWeights(projection_weights_); + + // Resetting cell_state and output_state + lstm.ResetCellState(); + lstm.ResetOutputState(); + + VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm); +} + +class BaseReduceOpModel : public SingleOpModelWithNNAPI { + public: + void SetAxis(const std::vector& data) { PopulateTensor(axis_, data); } + + template + void SetInput(std::vector data) { + PopulateTensor(input_, data); + } + + template + std::vector GetOutput() { + return ExtractVector(output_); + } + + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), + GetScale(output_), GetZeroPoint(output_)); + } + + std::vector GetOutputShape() { return GetTensorShape(output_); } + + int Input() { return input_; } + + protected: + int input_; + int axis_; + int output_; +}; + +// Model for the tests case where axis is a const tensor. +class MeanOpConstModel : public BaseReduceOpModel { + public: + MeanOpConstModel(const TensorData& input, const TensorData& output, + std::initializer_list axis_shape, + std::initializer_list axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +// Tests for reduce_mean +TEST(NNAPIDelegate, MeanFloatNotKeepDims) { + std::vector data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {2}}, + {4}, {1, 0, -3, -3}, false); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); +} + +TEST(NNAPIDelegate, MeanFloatKeepDims) { + std::vector data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpConstModel m({TensorType_FLOAT32, {4, 3, 2}}, {TensorType_FLOAT32, {3}}, + {2}, {0, 2}, true); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1})); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5}))); +} + +class BaseEmbeddingLookupOpModel : public SingleOpModelWithNNAPI { + public: + BaseEmbeddingLookupOpModel(std::initializer_list index_shape, + std::initializer_list weight_shape, + TensorType weight_type = TensorType_FLOAT32) { + input_ = AddInput(TensorType_INT32); + weight_ = AddInput(weight_type); + output_ = AddOutput(TensorType_FLOAT32); + SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP, BuiltinOptions_NONE, 0); + BuildInterpreter({index_shape, weight_shape}); + } + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + std::vector GetOutput() { return ExtractVector(output_); } + + protected: + int input_; + int weight_; + int output_; +}; + +class EmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel { + public: + using BaseEmbeddingLookupOpModel::BaseEmbeddingLookupOpModel; + + void Set3DWeightMatrix(const std::function& function) { + TfLiteTensor* tensor = interpreter_->tensor(weight_); + int rows = tensor->dims->data[0]; + int columns = tensor->dims->data[1]; + int features = tensor->dims->data[2]; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < columns; j++) { + for (int k = 0; k < features; k++) { + tensor->data.f[(i * columns + j) * features + k] = function(i, j, k); + } + } + } + } +}; + +TEST(NNAPIDelegate, EmbeddingLookupSimpleTest) { + EmbeddingLookupOpModel m({3}, {3, 2, 4}); + m.SetInput({1, 0, 2}); + m.Set3DWeightMatrix( + [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), + ElementsAreArray(ArrayFloatNear({ + 1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13, // Row 1 + 0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13, // Row 0 + 2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13, // Row 2 + }))); +} + +class HashtableLookupOpModel : public SingleOpModelWithNNAPI { + public: + HashtableLookupOpModel(std::initializer_list lookup_shape, + std::initializer_list key_shape, + std::initializer_list value_shape, + TensorType type) { + lookup_ = AddInput(TensorType_INT32); + key_ = AddInput(TensorType_INT32); + value_ = AddInput(type); + output_ = AddOutput(type); + hit_ = AddOutput(TensorType_UINT8); + SetBuiltinOp(BuiltinOperator_HASHTABLE_LOOKUP, BuiltinOptions_NONE, 0); + BuildInterpreter({lookup_shape, key_shape, value_shape}); + } + + void SetLookup(std::initializer_list data) { + PopulateTensor(lookup_, data); + } + + void SetHashtableKey(std::initializer_list data) { + PopulateTensor(key_, data); + } + + void SetHashtableValue(const std::vector& content) { + PopulateStringTensor(value_, content); + } + + void SetHashtableValue(const std::function& function) { + TfLiteTensor* tensor = interpreter_->tensor(value_); + int rows = tensor->dims->data[0]; + for (int i = 0; i < rows; i++) { + tensor->data.f[i] = function(i); + } + } + + void SetHashtableValue(const std::function& function) { + TfLiteTensor* tensor = interpreter_->tensor(value_); + int rows = tensor->dims->data[0]; + int features = tensor->dims->data[1]; + for (int i = 0; i < rows; i++) { + for (int j = 0; j < features; j++) { + tensor->data.f[i * features + j] = function(i, j); + } + } + } + + std::vector GetStringOutput() { + TfLiteTensor* output = interpreter_->tensor(output_); + int num = GetStringCount(output); + std::vector result(num); + for (int i = 0; i < num; i++) { + auto ref = GetString(output, i); + result[i] = string(ref.str, ref.len); + } + return result; + } + + std::vector GetOutput() { return ExtractVector(output_); } + std::vector GetHit() { return ExtractVector(hit_); } + + private: + int lookup_; + int key_; + int value_; + int output_; + int hit_; +}; + +TEST(NNAPIDelegate, HashtableLookupTest2DInput) { + HashtableLookupOpModel m({4}, {3}, {3, 2}, TensorType_FLOAT32); + + m.SetLookup({1234, -292, -11, 0}); + m.SetHashtableKey({-11, 0, 1234}); + m.SetHashtableValue([](int i, int j) { return i + j / 10.0f; }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 2.0, 2.1, // 2-nd item + 0, 0, // Not found + 0.0, 0.1, // 0-th item + 1.0, 1.1, // 1-st item + }))); + EXPECT_THAT(m.GetHit(), ElementsAreArray({ + 1, + 0, + 1, + 1, + })); +} + +TEST(NNAPIDelegate, HashtableLookupTest1DInput) { + HashtableLookupOpModel m({4}, {3}, {3}, TensorType_FLOAT32); + + m.SetLookup({1234, -292, -11, 0}); + m.SetHashtableKey({-11, 0, 1234}); + m.SetHashtableValue([](int i) { return i * i / 10.0f; }); + + m.Invoke(); + + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({ + 0.4, // 2-nd item + 0, // Not found + 0.0, // 0-th item + 0.1, // 1-st item + }))); + EXPECT_THAT(m.GetHit(), ElementsAreArray({ + 1, + 0, + 1, + 1, + })); +} } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/error_reporter.cc b/tensorflow/contrib/lite/error_reporter.cc index 03fcd5409ceab1895cea3b9e0e4fcb5a127e6a45..646913c0262c3483e999208651b5f0f872006cf6 100644 --- a/tensorflow/contrib/lite/error_reporter.cc +++ b/tensorflow/contrib/lite/error_reporter.cc @@ -16,6 +16,10 @@ limitations under the License. #include #include +#ifdef __ANDROID__ +#include +#endif + namespace tflite { ErrorReporter::~ErrorReporter() {} @@ -39,6 +43,15 @@ int ErrorReporter::ReportError(void*, const char* format, ...) { } int StderrReporter::Report(const char* format, va_list args) { +#ifdef __ANDROID__ + // On Android stderr is not captured for applications, only for code run from + // the shell. Rather than assume all users will set up a custom error + // reporter, let's output to logcat here + va_list args_for_log; + va_copy(args_for_log, args); + __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log); + va_end(args_for_log); +#endif const int result = vfprintf(stderr, format, args); fputc('\n', stderr); return result; diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm index d74e275f0439b1ce56b29e0eadff5f211f6a4faa..30fee64a6f621016446eff58c305e88fda01fa76 100644 --- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm +++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm @@ -315,7 +315,7 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const labelLayers = [[NSMutableArray alloc] init]; oldPredictionValues = [[NSMutableDictionary alloc] init]; - NSString* graph_path = FilePathForResourceName(model_file_name, @"tflite"); + NSString* graph_path = FilePathForResourceName(model_file_name, model_file_type); model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]); if (!model) { LOG(FATAL) << "Failed to mmap model " << graph_path; diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile index c7d3b1c966eaa0de71f5c37a6a77b3881e30ddd7..cd8c39043f6df61ed83e75e80a42156fdba68642 100644 --- a/tensorflow/contrib/lite/examples/ios/camera/Podfile +++ b/tensorflow/contrib/lite/examples/ios/camera/Podfile @@ -2,4 +2,4 @@ platform :ios, '8.0' inhibit_all_warnings! target 'tflite_camera_example' - pod 'TensorFlowLite' + pod 'TensorFlowLite', '0.1.7' diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile index e4aca2be82d437a0225d2c15d3e486b0344aa978..c885398f44456bc1b7429b4f6605237bbc64e654 100644 --- a/tensorflow/contrib/lite/examples/ios/simple/Podfile +++ b/tensorflow/contrib/lite/examples/ios/simple/Podfile @@ -2,4 +2,4 @@ platform :ios, '8.0' inhibit_all_warnings! target 'tflite_simple_example' - pod 'TensorFlowLite' + pod 'TensorFlowLite', '0.1.7' diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h index e36218e4f12057a362af47c48454f7930fc495f2..6fdcf78b69c6799fc2e666af1150efb88b55ff5c 100644 --- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h +++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h @@ -16,11 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_ #define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_ -#include "tensorflow/contrib/lite/builtin_op_data.h" -#include "tensorflow/contrib/lite/interpreter.h" -#include "tensorflow/contrib/lite/kernels/register.h" -#include "tensorflow/contrib/lite/string_util.h" -#include "tensorflow/contrib/lite/version.h" +#include "tensorflow/contrib/lite/examples/label_image/label_image.h" #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/interpreter.h" @@ -28,8 +24,6 @@ limitations under the License. #include "tensorflow/contrib/lite/string_util.h" #include "tensorflow/contrib/lite/version.h" -#include "tensorflow/contrib/lite/examples/label_image/label_image.h" - namespace tflite { namespace label_image { diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc index 86d7d1cc4a625243791d5e7d5b746526a58efb6d..7c6f523041ad5a516f348c1b4f66683128838228 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.cc +++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc @@ -213,22 +213,23 @@ void RunInference(Settings* s) { } } - const int output_size = 1000; - const size_t num_results = 5; const float threshold = 0.001f; std::vector> top_results; int output = interpreter->outputs()[0]; + TfLiteIntArray* output_dims = interpreter->tensor(output)->dims; + // assume output dims to be something like (1, 1, ... ,size) + auto output_size = output_dims->data[output_dims->size - 1]; switch (interpreter->tensor(output)->type) { case kTfLiteFloat32: get_top_n(interpreter->typed_output_tensor(0), output_size, - num_results, threshold, &top_results, true); + s->number_of_results, threshold, &top_results, true); break; case kTfLiteUInt8: get_top_n(interpreter->typed_output_tensor(0), - output_size, num_results, threshold, &top_results, - false); + output_size, s->number_of_results, threshold, + &top_results, false); break; default: LOG(FATAL) << "cannot handle output type " @@ -259,6 +260,7 @@ void display_usage() { << "--labels, -l: labels for the model\n" << "--tflite_model, -m: model_name.tflite\n" << "--profiling, -p: [0|1], profiling or not\n" + << "--num_results, -r: number of results to show\n" << "--threads, -t: number of threads\n" << "--verbose, -v: [0|1] print more information\n" << "\n"; @@ -280,12 +282,13 @@ int Main(int argc, char** argv) { {"threads", required_argument, nullptr, 't'}, {"input_mean", required_argument, nullptr, 'b'}, {"input_std", required_argument, nullptr, 's'}, + {"num_results", required_argument, nullptr, 'r'}, {nullptr, 0, nullptr, 0}}; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:b:c:f:i:l:m:p:s:t:v:", long_options, + c = getopt_long(argc, argv, "a:b:c:f:i:l:m:p:r:s:t:v:", long_options, &option_index); /* Detect the end of the options. */ @@ -315,6 +318,10 @@ int Main(int argc, char** argv) { s.profiling = strtol(optarg, nullptr, 10); // NOLINT(runtime/deprecated_fn) break; + case 'r': + s.number_of_results = + strtol(optarg, nullptr, 10); // NOLINT(runtime/deprecated_fn) + break; case 's': s.input_std = strtod(optarg, nullptr); break; diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h index 4b48014e1c77eca1eca081f0fe906441a5dcce22..34c223f713b9fe7692440a6b7538f00be995ad11 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.h +++ b/tensorflow/contrib/lite/examples/label_image/label_image.h @@ -34,6 +34,7 @@ struct Settings { string labels_file_name = "./labels.txt"; string input_layer_type = "uint8_t"; int number_of_threads = 4; + int number_of_results = 5; }; } // namespace label_image diff --git a/tensorflow/contrib/lite/examples/python/BUILD b/tensorflow/contrib/lite/examples/python/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..d337c3ddc43a23e50a5afdab93b16c0f61ccd538 --- /dev/null +++ b/tensorflow/contrib/lite/examples/python/BUILD @@ -0,0 +1,13 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +py_binary( + name = "label_image", + srcs = ["label_image.py"], + main = "label_image.py", + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/contrib/lite/python:lite", + ], +) diff --git a/tensorflow/contrib/lite/examples/python/label_image.md b/tensorflow/contrib/lite/examples/python/label_image.md new file mode 100644 index 0000000000000000000000000000000000000000..e81192a96c142f2b3e7e85d160166fdd37ccdc53 --- /dev/null +++ b/tensorflow/contrib/lite/examples/python/label_image.md @@ -0,0 +1,50 @@ + +With model, input image (grace_hopper.bmp), and labels file (labels.txt) +in /tmp. + +The example input image and labels file are from TensorFlow repo and +MobileNet V1 model files. + +``` +curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp + +curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz | tar xzv -C /tmp mobilenet_v1_1.0_224/labels.txt +mv /tmp/mobilenet_v1_1.0_224/labels.txt /tmp/ + +``` + +Run + +``` +curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz | tar xzv -C /tmp +bazel run --config opt //tensorflow/contrib/lite/examples/python:label_image +``` + +We can get results like + +``` +0.470588: military uniform +0.337255: Windsor tie +0.047059: bow tie +0.031373: mortarboard +0.019608: suit +``` + +Run + +``` +curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp +bazel run --config opt //tensorflow/contrib/lite/examples/python:label_image \ +-- --model_file /tmp/mobilenet_v1_1.0_224.tflite +``` + +We can get results like +``` +0.728693: military uniform +0.116163: Windsor tie +0.035517: bow tie +0.014874: mortarboard +0.011758: bolo tie +``` + +Check [models](../../g3doc/models.md) for models hosted by Google. diff --git a/tensorflow/contrib/lite/examples/python/label_image.py b/tensorflow/contrib/lite/examples/python/label_image.py new file mode 100644 index 0000000000000000000000000000000000000000..282118a1d2b43a08930b24366110a021fc634b5e --- /dev/null +++ b/tensorflow/contrib/lite/examples/python/label_image.py @@ -0,0 +1,86 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""label_image for tflite""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import numpy as np + +from PIL import Image + +from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper + +def load_labels(filename): + my_labels = [] + input_file = open(filename, 'r') + for l in input_file: + my_labels.append(l.strip()) + return my_labels + +if __name__ == "__main__": + floating_model = False + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--image", default="/tmp/grace_hopper.bmp", \ + help="image to be classified") + parser.add_argument("-m", "--model_file", \ + default="/tmp/mobilenet_v1_1.0_224_quant.tflite", \ + help=".tflite model to be executed") + parser.add_argument("-l", "--label_file", default="/tmp/labels.txt", \ + help="name of file containing labels") + parser.add_argument("--input_mean", default=127.5, help="input_mean") + parser.add_argument("--input_std", default=127.5, \ + help="input standard deviation") + args = parser.parse_args() + + interpreter = interpreter_wrapper.Interpreter(model_path=args.model_file) + interpreter.allocate_tensors() + + input_details = interpreter.get_input_details() + output_details = interpreter.get_output_details() + + # check the type of the input tensor + if input_details[0]['dtype'] == np.float32: + floating_model = True + + # NxHxWxC, H:1, W:2 + height = input_details[0]['shape'][1] + width = input_details[0]['shape'][2] + img = Image.open(args.image) + img = img.resize((width, height)) + + # add N dim + input_data = np.expand_dims(img, axis=0) + + if floating_model: + input_data = (np.float32(input_data) - args.input_mean) / args.input_std + + interpreter.set_tensor(input_details[0]['index'], input_data) + + interpreter.invoke() + + output_data = interpreter.get_tensor(output_details[0]['index']) + results = np.squeeze(output_data) + + top_k = results.argsort()[-5:][::-1] + labels = load_labels(args.label_file) + for i in top_k: + if floating_model: + print('{0:08.6f}'.format(float(results[i]))+":", labels[i]) + else: + print('{0:08.6f}'.format(float(results[i]/255.0))+":", labels[i]) diff --git a/tensorflow/contrib/lite/experimental/c/BUILD b/tensorflow/contrib/lite/experimental/c/BUILD index 50f8da66d06abaf0637866e85c04e80fee042071..8fc07e8eb7eb1b53cc94eed75093c49c29679d77 100644 --- a/tensorflow/contrib/lite/experimental/c/BUILD +++ b/tensorflow/contrib/lite/experimental/c/BUILD @@ -26,17 +26,33 @@ tflite_cc_shared_object( }), deps = [ ":c_api", + ":c_api_experimental", ":exported_symbols.lds", ":version_script.lds", ], ) +cc_library( + name = "c_api_internal", + srcs = ["c_api.h"], + hdrs = ["c_api_internal.h"], + copts = tflite_copts(), + visibility = [ + "//tensorflow/contrib/lite/experimental/c:__subpackages__", + ], + deps = [ + "//tensorflow/contrib/lite:context", + "//tensorflow/contrib/lite:framework", + ], +) + cc_library( name = "c_api", srcs = ["c_api.cc"], hdrs = ["c_api.h"], copts = tflite_copts(), deps = [ + ":c_api_internal", "//tensorflow/contrib/lite:context", "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:schema_fbs_version", @@ -44,6 +60,17 @@ cc_library( ], ) +cc_library( + name = "c_api_experimental", + srcs = ["c_api_experimental.cc"], + hdrs = ["c_api_experimental.h"], + copts = tflite_copts(), + deps = [ + ":c_api", + ":c_api_internal", + ], +) + cc_test( name = "c_api_test", size = "small", @@ -51,9 +78,21 @@ cc_test( data = ["//tensorflow/contrib/lite:testdata/add.bin"], deps = [ ":c_api", - "//tensorflow/contrib/lite:framework", "//tensorflow/contrib/lite:kernel_api", "//tensorflow/contrib/lite/testing:util", "@com_google_googletest//:gtest", ], ) + +cc_test( + name = "c_api_experimental_test", + size = "small", + srcs = ["c_api_experimental_test.cc"], + data = ["//tensorflow/contrib/lite:testdata/add.bin"], + deps = [ + ":c_api", + ":c_api_experimental", + "//tensorflow/contrib/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) diff --git a/tensorflow/contrib/lite/experimental/c/c_api.cc b/tensorflow/contrib/lite/experimental/c/c_api.cc index 9d29e8b3e055e86a9e68285d81de742e36452215..a4ab0e8c306b5b1e514e1ddf0c166ba0b43d75d1 100644 --- a/tensorflow/contrib/lite/experimental/c/c_api.cc +++ b/tensorflow/contrib/lite/experimental/c/c_api.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/contrib/lite/experimental/c/c_api.h" #include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/experimental/c/c_api_internal.h" #include "tensorflow/contrib/lite/interpreter.h" #include "tensorflow/contrib/lite/kernels/register.h" #include "tensorflow/contrib/lite/model.h" @@ -23,28 +24,55 @@ limitations under the License. extern "C" { #endif // __cplusplus -struct _TFL_Interpreter { - std::unique_ptr impl; -}; - // LINT.IfChange -TFL_Interpreter* TFL_NewInterpreter(const void* model_data, - int32_t model_size) { +TFL_Model* TFL_NewModel(const void* model_data, size_t model_size) { auto model = tflite::FlatBufferModel::BuildFromBuffer( - static_cast(model_data), static_cast(model_size)); - if (!model) { + static_cast(model_data), model_size); + return model ? new TFL_Model{std::move(model)} : nullptr; +} + +TFL_Model* TFL_NewModelFromFile(const char* model_path) { + auto model = tflite::FlatBufferModel::BuildFromFile(model_path); + return model ? new TFL_Model{std::move(model)} : nullptr; +} + +void TFL_DeleteModel(TFL_Model* model) { delete model; } + +TFL_InterpreterOptions* TFL_NewInterpreterOptions() { + return new TFL_InterpreterOptions{}; +} + +void TFL_DeleteInterpreterOptions(TFL_InterpreterOptions* options) { + delete options; +} + +void TFL_InterpreterOptionsSetNumThreads(TFL_InterpreterOptions* options, + int32_t num_threads) { + options->num_threads = num_threads; +} + +TFL_Interpreter* TFL_NewInterpreter( + const TFL_Model* model, const TFL_InterpreterOptions* optional_options) { + if (!model || !model->impl) { return nullptr; } tflite::ops::builtin::BuiltinOpResolver resolver; - tflite::InterpreterBuilder builder(*model, resolver); - std::unique_ptr interpreter_impl; - if (builder(&interpreter_impl) != kTfLiteOk) { + tflite::InterpreterBuilder builder(*model->impl, resolver); + std::unique_ptr interpreter; + if (builder(&interpreter) != kTfLiteOk) { return nullptr; } - return new TFL_Interpreter{std::move(interpreter_impl)}; + if (optional_options) { + if (optional_options->num_threads != + TFL_InterpreterOptions::kDefaultNumThreads) { + interpreter->SetNumThreads(optional_options->num_threads); + } + } + + return new TFL_Interpreter{std::move(interpreter)}; } void TFL_DeleteInterpreter(TFL_Interpreter* interpreter) { delete interpreter; } @@ -97,9 +125,13 @@ int32_t TFL_TensorDim(const TFL_Tensor* tensor, int32_t dim_index) { size_t TFL_TensorByteSize(const TFL_Tensor* tensor) { return tensor->bytes; } +void* TFL_TensorData(const TFL_Tensor* tensor) { + return static_cast(tensor->data.raw); +} + TFL_Status TFL_TensorCopyFromBuffer(TFL_Tensor* tensor, const void* input_data, - int32_t input_data_size) { - if (tensor->bytes != static_cast(input_data_size)) { + size_t input_data_size) { + if (tensor->bytes != input_data_size) { return kTfLiteError; } memcpy(tensor->data.raw, input_data, input_data_size); @@ -107,8 +139,8 @@ TFL_Status TFL_TensorCopyFromBuffer(TFL_Tensor* tensor, const void* input_data, } TFL_Status TFL_TensorCopyToBuffer(const TFL_Tensor* tensor, void* output_data, - int32_t output_data_size) { - if (tensor->bytes != static_cast(output_data_size)) { + size_t output_data_size) { + if (tensor->bytes != output_data_size) { return kTfLiteError; } memcpy(output_data, tensor->data.raw, output_data_size); diff --git a/tensorflow/contrib/lite/experimental/c/c_api.h b/tensorflow/contrib/lite/experimental/c/c_api.h index 070f1add13c9904e1a2b3736001ada0e274fdc55..3757349b5510ea3c3ac876b50b5c8c7db14688c9 100644 --- a/tensorflow/contrib/lite/experimental/c/c_api.h +++ b/tensorflow/contrib/lite/experimental/c/c_api.h @@ -30,6 +30,9 @@ limitations under the License. // // Conventions: // * We use the prefix TFL_ for everything in the API. +// * size_t is used to represent byte sizes of objects that are +// materialized in the address space of the calling process. +// * int is used as an index into arrays. #ifdef SWIG #define TFL_CAPI_EXPORT @@ -53,16 +56,51 @@ typedef TfLiteTensor TFL_Tensor; typedef TfLiteStatus TFL_Status; typedef TfLiteType TFL_Type; +// -------------------------------------------------------------------------- +// TFL_Model wraps a loaded TensorFlow Lite model. +typedef struct TFL_Model TFL_Model; + +// Returns a model from the provided buffer, or null on failure. +TFL_CAPI_EXPORT extern TFL_Model* TFL_NewModel(const void* model_data, + size_t model_size); + +// Returns a model from the provided file, or null on failure. +TFL_CAPI_EXPORT extern TFL_Model* TFL_NewModelFromFile(const char* model_path); + +// Destroys the model instance. +TFL_CAPI_EXPORT extern void TFL_DeleteModel(TFL_Model* model); + +// -------------------------------------------------------------------------- +// TFL_InterpreterOptions allows customized interpreter configuration. +typedef struct TFL_InterpreterOptions TFL_InterpreterOptions; + +// Returns a new interpreter options instances. +TFL_CAPI_EXPORT extern TFL_InterpreterOptions* TFL_NewInterpreterOptions(); + +// Destroys the interpreter options instance. +TFL_CAPI_EXPORT extern void TFL_DeleteInterpreterOptions( + TFL_InterpreterOptions* options); + +// Sets the number of CPU threads to use for the interpreter. +TFL_CAPI_EXPORT extern void TFL_InterpreterOptionsSetNumThreads( + TFL_InterpreterOptions* options, int32_t num_threads); + // -------------------------------------------------------------------------- // TFL_Interpreter provides inference from a provided model. -typedef struct _TFL_Interpreter TFL_Interpreter; +typedef struct TFL_Interpreter TFL_Interpreter; -// Returns an interpreter for the provided model, or null on failure. +// Returns a new interpreter using the provided model and options, or null on +// failure. +// +// * `model` must be a valid model instance. The caller retains ownership of the +// object, and can destroy it immediately after creating the interpreter. +// * `optional_options` may be null. The caller retains ownership of the object, +// and can safely destroy it immediately after creating the interpreter. // // NOTE: The client *must* explicitly allocate tensors before attempting to // access input tensor data or invoke the interpreter. TFL_CAPI_EXPORT extern TFL_Interpreter* TFL_NewInterpreter( - const void* model_data, int32_t model_size); + const TFL_Model* model, const TFL_InterpreterOptions* optional_options); // Destroys the interpreter. TFL_CAPI_EXPORT extern void TFL_DeleteInterpreter(TFL_Interpreter* interpreter); @@ -76,7 +114,8 @@ TFL_CAPI_EXPORT extern int TFL_InterpreterGetInputTensorCount( TFL_CAPI_EXPORT extern TFL_Tensor* TFL_InterpreterGetInputTensor( const TFL_Interpreter* interpreter, int32_t input_index); -// Attempts to resize the specified input tensor. +// Resizes the specified input tensor. +// // NOTE: After a resize, the client *must* explicitly allocate tensors before // attempting to access the resized tensor data or invoke the interpreter. // REQUIRES: 0 <= input_index < TFL_InterpreterGetInputTensorCount(tensor) @@ -131,16 +170,24 @@ TFL_CAPI_EXPORT extern int32_t TFL_TensorDim(const TFL_Tensor* tensor, // Returns the size of the underlying data in bytes. TFL_CAPI_EXPORT extern size_t TFL_TensorByteSize(const TFL_Tensor* tensor); +// Returns a pointer to the underlying data buffer. +// +// Note: The result may be null if tensors have not yet been allocated, e.g., +// if the Tensor has just been created or resized and `TFL_AllocateTensors()` +// has yet to be called, or if the output tensor is dynamically sized and the +// interpreter hasn't been invoked. +TFL_CAPI_EXPORT extern void* TFL_TensorData(const TFL_Tensor* tensor); + // Copies from the provided input buffer into the tensor's buffer. // REQUIRES: input_data_size == TFL_TensorByteSize(tensor) TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyFromBuffer( - TFL_Tensor* tensor, const void* input_data, int32_t input_data_size); + TFL_Tensor* tensor, const void* input_data, size_t input_data_size); // Copies to the provided output buffer from the tensor's buffer. // REQUIRES: output_data_size == TFL_TensorByteSize(tensor) TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyToBuffer( const TFL_Tensor* output_tensor, void* output_data, - int32_t output_data_size); + size_t output_data_size); #ifdef __cplusplus } // extern "C" diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc similarity index 62% rename from tensorflow/compiler/xla/client/xla_client/xla_builder.h rename to tensorflow/contrib/lite/experimental/c/c_api_experimental.cc index ce2a8afd4cb1e7037e68a02670af707f3ff9252c..c4dbc55cbf6b116df46553411be5337f83ceb4e7 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h +++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc @@ -13,9 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ -#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ +#include "tensorflow/contrib/lite/experimental/c/c_api_experimental.h" -#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/contrib/lite/experimental/c/c_api_internal.h" -#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +TFL_Status TFL_InterpreterResetVariableTensorsToZero( + TFL_Interpreter* interpreter) { + return interpreter->impl->ResetVariableTensorsToZero(); +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental.h b/tensorflow/contrib/lite/experimental/c/c_api_experimental.h new file mode 100644 index 0000000000000000000000000000000000000000..b0ac258dcf9bf4ab603ba847f1b111a89cf2f29b --- /dev/null +++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental.h @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_ + +#include "tensorflow/contrib/lite/experimental/c/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Resets all variable tensors to zero. +TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterResetVariableTensorsToZero( + TFL_Interpreter* interpreter); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_ diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..db6e5251de518d2e754f853edbfb1c1edc425a83 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/lite/experimental/c/c_api_experimental.h" + +#include +#include "tensorflow/contrib/lite/experimental/c/c_api.h" +#include "tensorflow/contrib/lite/testing/util.h" + +namespace { + +TEST(CApiExperimentalSimple, Smoke) { + TFL_Model* model = TFL_NewModelFromFile( + "tensorflow/contrib/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); + + TFL_Interpreter* interpreter = + TFL_NewInterpreter(model, /*optional_options=*/nullptr); + ASSERT_NE(interpreter, nullptr); + ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk); + + EXPECT_EQ(TFL_InterpreterResetVariableTensorsToZero(interpreter), kTfLiteOk); + + TFL_DeleteModel(model); + TFL_DeleteInterpreter(interpreter); +} + +} // namespace + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/experimental/c/c_api_internal.h b/tensorflow/contrib/lite/experimental/c/c_api_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..c5c612a4c6d3f8ccc49697961fd87b81bc00b6a8 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/c/c_api_internal.h @@ -0,0 +1,41 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_ + +#include "tensorflow/contrib/lite/experimental/c/c_api.h" + +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/model.h" + +// Internal structures used by the C API. These are likely to change and should +// not be depended on. + +struct TFL_Model { + std::unique_ptr impl; +}; + +struct TFL_InterpreterOptions { + enum { + kDefaultNumThreads = -1, + }; + int num_threads = kDefaultNumThreads; +}; + +struct TFL_Interpreter { + std::unique_ptr impl; +}; + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_ diff --git a/tensorflow/contrib/lite/experimental/c/c_api_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_test.cc index bc925e00a6096c5e8abcc0fa68b335c4db4401c3..a631dae8906a2f5ab10b4125454f2eafb937823f 100644 --- a/tensorflow/contrib/lite/experimental/c/c_api_test.cc +++ b/tensorflow/contrib/lite/experimental/c/c_api_test.cc @@ -18,22 +18,28 @@ limitations under the License. #include "tensorflow/contrib/lite/experimental/c/c_api.h" #include -#include "tensorflow/contrib/lite/allocation.h" #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/testing/util.h" namespace { TEST(CApiSimple, Smoke) { - tflite::FileCopyAllocation model_file( - "tensorflow/contrib/lite/testdata/add.bin", - tflite::DefaultErrorReporter()); + TFL_Model* model = TFL_NewModelFromFile( + "tensorflow/contrib/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); - TFL_Interpreter* interpreter = - TFL_NewInterpreter(model_file.base(), model_file.bytes()); + TFL_InterpreterOptions* options = TFL_NewInterpreterOptions(); + ASSERT_NE(options, nullptr); + TFL_InterpreterOptionsSetNumThreads(options, 2); + + TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options); ASSERT_NE(interpreter, nullptr); - ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk); + // The options/model can be deleted immediately after interpreter creation. + TFL_DeleteInterpreterOptions(options); + TFL_DeleteModel(model); + + ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk); ASSERT_EQ(TFL_InterpreterGetInputTensorCount(interpreter), 1); ASSERT_EQ(TFL_InterpreterGetOutputTensorCount(interpreter), 1); diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity index 9397d8f27a6cce6013203afc8acc3266a429825d..bcf24b89e335781877a7046001ac4deb6fc55041 100644 --- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity +++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity @@ -154,7 +154,7 @@ Camera: m_Enabled: 1 serializedVersion: 2 m_ClearFlags: 1 - m_BackGroundColor: {r: 0.19215687, g: 0.3019608, b: 0.4745098, a: 0} + m_BackGroundColor: {r: 0.21933319, g: 0.21933319, b: 0.21933319, a: 0} m_NormalizedViewPortRect: serializedVersion: 2 x: 0 @@ -195,6 +195,100 @@ Transform: m_Father: {fileID: 0} m_RootOrder: 0 m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} +--- !u!1 &871349752 +GameObject: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + serializedVersion: 5 + m_Component: + - component: {fileID: 871349756} + - component: {fileID: 871349755} + - component: {fileID: 871349754} + - component: {fileID: 871349753} + m_Layer: 5 + m_Name: Canvas + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!114 &871349753 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 871349752} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 1301386320, guid: f5f67c52d1564df4a8936ccd202a3bd8, type: 3} + m_Name: + m_EditorClassIdentifier: + m_IgnoreReversedGraphics: 1 + m_BlockingObjects: 0 + m_BlockingMask: + serializedVersion: 2 + m_Bits: 4294967295 +--- !u!114 &871349754 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 871349752} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 1980459831, guid: f5f67c52d1564df4a8936ccd202a3bd8, type: 3} + m_Name: + m_EditorClassIdentifier: + m_UiScaleMode: 0 + m_ReferencePixelsPerUnit: 100 + m_ScaleFactor: 1 + m_ReferenceResolution: {x: 800, y: 600} + m_ScreenMatchMode: 0 + m_MatchWidthOrHeight: 0 + m_PhysicalUnit: 3 + m_FallbackScreenDPI: 96 + m_DefaultSpriteDPI: 96 + m_DynamicPixelsPerUnit: 1 +--- !u!223 &871349755 +Canvas: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 871349752} + m_Enabled: 1 + serializedVersion: 3 + m_RenderMode: 0 + m_Camera: {fileID: 0} + m_PlaneDistance: 100 + m_PixelPerfect: 0 + m_ReceivesEvents: 1 + m_OverrideSorting: 0 + m_OverridePixelPerfect: 0 + m_SortingBucketNormalizedSize: 0 + m_AdditionalShaderChannelsFlag: 0 + m_SortingLayerID: 0 + m_SortingOrder: 0 + m_TargetDisplay: 0 +--- !u!224 &871349756 +RectTransform: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 871349752} + m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 0, y: 0, z: 0} + m_Children: + - {fileID: 1726294324} + m_Father: {fileID: 0} + m_RootOrder: 1 + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} + m_AnchorMin: {x: 0, y: 0} + m_AnchorMax: {x: 0, y: 0} + m_AnchoredPosition: {x: 0, y: 0} + m_SizeDelta: {x: 0, y: 0} + m_Pivot: {x: 0, y: 0} --- !u!1 &904015943 GameObject: m_ObjectHideFlags: 0 @@ -240,3 +334,144 @@ MonoBehaviour: - 1 - 3 - 7 + inferenceText: {fileID: 1726294325} +--- !u!1 &1726294323 +GameObject: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + serializedVersion: 5 + m_Component: + - component: {fileID: 1726294324} + - component: {fileID: 1726294326} + - component: {fileID: 1726294325} + m_Layer: 5 + m_Name: InferenceText + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!224 &1726294324 +RectTransform: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 1726294323} + m_LocalRotation: {x: -0, y: -0, z: -0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 1, y: 1, z: 1} + m_Children: [] + m_Father: {fileID: 871349756} + m_RootOrder: 0 + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} + m_AnchorMin: {x: 0.5, y: 0.5} + m_AnchorMax: {x: 0.5, y: 0.5} + m_AnchoredPosition: {x: 0, y: 25} + m_SizeDelta: {x: 450, y: 250} + m_Pivot: {x: 0.5, y: 0.5} +--- !u!114 &1726294325 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 1726294323} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 708705254, guid: f5f67c52d1564df4a8936ccd202a3bd8, type: 3} + m_Name: + m_EditorClassIdentifier: + m_Material: {fileID: 0} + m_Color: {r: 0.9338235, g: 0.9338235, b: 0.9338235, a: 1} + m_RaycastTarget: 1 + m_OnCullStateChanged: + m_PersistentCalls: + m_Calls: [] + m_TypeName: UnityEngine.UI.MaskableGraphic+CullStateChangedEvent, UnityEngine.UI, + Version=1.0.0.0, Culture=neutral, PublicKeyToken=null + m_FontData: + m_Font: {fileID: 10102, guid: 0000000000000000e000000000000000, type: 0} + m_FontSize: 35 + m_FontStyle: 0 + m_BestFit: 0 + m_MinSize: 2 + m_MaxSize: 40 + m_Alignment: 4 + m_AlignByGeometry: 0 + m_RichText: 1 + m_HorizontalOverflow: 0 + m_VerticalOverflow: 0 + m_LineSpacing: 1 + m_Text: 'Inference took 0.0153 ms + + Input: 1,3,7 + + Output: 3,9,21' +--- !u!222 &1726294326 +CanvasRenderer: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 1726294323} +--- !u!1 &2026426602 +GameObject: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + serializedVersion: 5 + m_Component: + - component: {fileID: 2026426605} + - component: {fileID: 2026426604} + - component: {fileID: 2026426603} + m_Layer: 0 + m_Name: EventSystem + m_TagString: Untagged + m_Icon: {fileID: 0} + m_NavMeshLayer: 0 + m_StaticEditorFlags: 0 + m_IsActive: 1 +--- !u!114 &2026426603 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 2026426602} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 1077351063, guid: f5f67c52d1564df4a8936ccd202a3bd8, type: 3} + m_Name: + m_EditorClassIdentifier: + m_HorizontalAxis: Horizontal + m_VerticalAxis: Vertical + m_SubmitButton: Submit + m_CancelButton: Cancel + m_InputActionsPerSecond: 10 + m_RepeatDelay: 0.5 + m_ForceModuleActive: 0 +--- !u!114 &2026426604 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 2026426602} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: -619905303, guid: f5f67c52d1564df4a8936ccd202a3bd8, type: 3} + m_Name: + m_EditorClassIdentifier: + m_FirstSelected: {fileID: 0} + m_sendNavigationEvents: 1 + m_DragThreshold: 5 +--- !u!4 &2026426605 +Transform: + m_ObjectHideFlags: 0 + m_PrefabParentObject: {fileID: 0} + m_PrefabInternal: {fileID: 0} + m_GameObject: {fileID: 2026426602} + m_LocalRotation: {x: 0, y: 0, z: 0, w: 1} + m_LocalPosition: {x: 0, y: 0, z: 0} + m_LocalScale: {x: 1, y: 1, z: 1} + m_Children: [] + m_Father: {fileID: 0} + m_RootOrder: 2 + m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0} diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs index abca8144998367eadaeb0b75d85bb0f6cf3a2057..83291e61794819e7c57f69ed2be6ea40294e01da 100644 --- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs +++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs @@ -18,6 +18,7 @@ using System.Collections.Generic; using System.Linq; using TensorFlowLite; using UnityEngine; +using UnityEngine.UI; /// /// Simple example demonstrating use of the experimental C# bindings for TensorFlowLite. @@ -30,14 +31,24 @@ public class HelloTFLite : MonoBehaviour { [Tooltip("Configurable TFLite input tensor data.")] public float[] inputs; + [Tooltip("Target Text widget for display of inference execution.")] + public Text inferenceText; + private Interpreter interpreter; private float[] outputs; + void Awake() { + // As the demo is extremely simple, there's no need to run at full frame-rate. + QualitySettings.vSyncCount = 0; + Application.targetFrameRate = 5; + } + void Start () { interpreter = new Interpreter(model.bytes); - Debug.LogFormat("InputCount: {0}, OutputCount: {1}", - interpreter.GetInputTensorCount(), - interpreter.GetOutputTensorCount()); + Debug.LogFormat( + "InputCount: {0}, OutputCount: {1}", + interpreter.GetInputTensorCount(), + interpreter.GetOutputTensorCount()); } void Update () { @@ -51,13 +62,17 @@ public class HelloTFLite : MonoBehaviour { outputs = new float[inputs.Length]; } + float startTimeSeconds = Time.realtimeSinceStartup; interpreter.SetInputTensorData(0, inputs); interpreter.Invoke(); interpreter.GetOutputTensorData(0, outputs); + float inferenceTimeSeconds = Time.realtimeSinceStartup - startTimeSeconds; - Debug.LogFormat("Input: {0}, Output: {1}", - ArrayToString(inputs), - ArrayToString(outputs)); + inferenceText.text = string.Format( + "Inference took {0:0.0000} ms\nInput(s): {1}\nOutput(s): {2}", + inferenceTimeSeconds * 1000.0, + ArrayToString(inputs), + ArrayToString(outputs)); } void OnDestroy() { diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs index ab966bae2efb9431e2f9f35dc818d130aabd71f6..b6905b5fbfe5b49e30d79b372b3be35d90fe252a 100644 --- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs +++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs @@ -16,6 +16,8 @@ using System; using System.Runtime.InteropServices; using TFL_Interpreter = System.IntPtr; +using TFL_InterpreterOptions = System.IntPtr; +using TFL_Model = System.IntPtr; using TFL_Tensor = System.IntPtr; namespace TensorFlowLite @@ -32,7 +34,9 @@ namespace TensorFlowLite public Interpreter(byte[] modelData) { GCHandle modelDataHandle = GCHandle.Alloc(modelData, GCHandleType.Pinned); IntPtr modelDataPtr = modelDataHandle.AddrOfPinnedObject(); - handle = TFL_NewInterpreter(modelDataPtr, modelData.Length); + TFL_Model model = TFL_NewModel(modelDataPtr, modelData.Length); + handle = TFL_NewInterpreter(model, /*options=*/IntPtr.Zero); + TFL_DeleteModel(model); if (handle == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Interpreter"); } @@ -88,10 +92,16 @@ namespace TensorFlowLite #region Externs + [DllImport (TensorFlowLibrary)] + private static extern unsafe TFL_Interpreter TFL_NewModel(IntPtr model_data, int model_size); + + [DllImport (TensorFlowLibrary)] + private static extern unsafe TFL_Interpreter TFL_DeleteModel(TFL_Model model); + [DllImport (TensorFlowLibrary)] private static extern unsafe TFL_Interpreter TFL_NewInterpreter( - IntPtr model_data, - int model_size); + TFL_Model model, + TFL_InterpreterOptions optional_options); [DllImport (TensorFlowLibrary)] private static extern unsafe void TFL_DeleteInterpreter(TFL_Interpreter interpreter); diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset index 74d7b532b092680d2b87092007e38f2cbc6b3a00..a9bbfb02d1e7065b7d0e90609a3928d667933477 100644 --- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset +++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset @@ -35,6 +35,9 @@ GraphicsSettings: - {fileID: 15106, guid: 0000000000000000f000000000000000, type: 0} - {fileID: 10753, guid: 0000000000000000f000000000000000, type: 0} - {fileID: 10770, guid: 0000000000000000f000000000000000, type: 0} + - {fileID: 17000, guid: 0000000000000000f000000000000000, type: 0} + - {fileID: 16000, guid: 0000000000000000f000000000000000, type: 0} + - {fileID: 16002, guid: 0000000000000000f000000000000000, type: 0} m_PreloadedShaders: [] m_SpritesDefaultMaterial: {fileID: 10754, guid: 0000000000000000f000000000000000, type: 0} diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md index 0b3813fccb10c3a89fb462f9ab6bb81c6a9a147a..f480c49cd050de2192e9673f72c9e4d5c3c6ceff 100644 --- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md +++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md @@ -1,6 +1,6 @@ # TF Lite Experimental Unity Plugin -This directoryy contains an experimental sample Unity (2017) Plugin, based on +This directory contains an experimental sample Unity (2017) Plugin, based on the experimental TF Lite C API. The sample demonstrates running inference within Unity by way of a C# `Interpreter` wrapper. @@ -22,3 +22,8 @@ bazel build -c opt --cxxopt=--std=c++11 \ --cpu=armeabi-v7a \ //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so ``` + +If you encounter issues with native plugin discovery on Mac ("Darwin") +platforms, try renaming `libtensorflowlite_c.so` to `tensorflowlite_c.bundle`. +Similarly, on Windows you'll likely need to rename `libtensorflowlite_c.so` to +`tensorflowlite_c.dll`. diff --git a/tensorflow/contrib/lite/experimental/kernels/BUILD b/tensorflow/contrib/lite/experimental/kernels/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..9c06c4ebd958294586dbb1fde5040a0d328954ac --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/BUILD @@ -0,0 +1,84 @@ +package(default_visibility = [ + "//visibility:public", +]) + +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +# ctc support classes imported directly from TensorFlow. +cc_library( + name = "ctc_utils", + hdrs = [ + "ctc_beam_entry.h", + "ctc_beam_scorer.h", + "ctc_beam_search.h", + "ctc_decoder.h", + "ctc_loss_util.h", + ], + deps = [ + ":top_n", + "//tensorflow/contrib/lite/kernels/internal:types", + "//third_party/eigen3", + ], +) + +# top_n support classes imported directly from TensorFlow. +cc_library( + name = "top_n", + hdrs = [ + "top_n.h", + ], + deps = [ + "//tensorflow/contrib/lite/kernels/internal:types", + ], +) + +cc_library( + name = "experimental_ops", + srcs = [ + "ctc_beam_search_decoder.cc", + ], + # Suppress warnings that are introduced by Eigen Tensor. + copts = tflite_copts() + [ + "-Wno-error=reorder", + ] + select({ + "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"], + "//conditions:default": [ + ], + }), + deps = [ + ":ctc_utils", + "//tensorflow/contrib/lite:builtin_op_data", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite:string_util", + "//tensorflow/contrib/lite/kernels:builtin_ops", + "//tensorflow/contrib/lite/kernels:gemm_support", + "//tensorflow/contrib/lite/kernels:kernel_util", + "//tensorflow/contrib/lite/kernels:op_macros", + "//tensorflow/contrib/lite/kernels/internal:kernel_utils", + "//tensorflow/contrib/lite/kernels/internal:optimized", + "//tensorflow/contrib/lite/kernels/internal:optimized_base", + "//tensorflow/contrib/lite/kernels/internal:quantization_util", + "//tensorflow/contrib/lite/kernels/internal:reference", + "//tensorflow/contrib/lite/kernels/internal:reference_base", + "//tensorflow/contrib/lite/kernels/internal:tensor_utils", + "@flatbuffers", + ], +) + +tf_cc_test( + name = "ctc_beam_search_decoder_test", + size = "small", + srcs = ["ctc_beam_search_decoder_test.cc"], + tags = ["tflite_not_portable_ios"], + deps = [ + ":experimental_ops", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:builtin_ops", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h new file mode 100644 index 0000000000000000000000000000000000000000..a60ff2a1c53f1b3f9f490ab5cf2bc429ba09dff0 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h @@ -0,0 +1,150 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Copied from tensorflow/core/util/ctc/ctc_beam_entry.h +// TODO(b/111524997): Remove this file. +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_ + +#include +#include +#include +#include + +#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h" + +namespace tflite { +namespace experimental { +namespace ctc { + +// The ctc_beam_search namespace holds several classes meant to be accessed only +// in case of extending the CTCBeamSearch decoder to allow custom scoring +// functions. +// +// BeamEntry is exposed through template arguments BeamScorer and BeamComparer +// of CTCBeamSearch (ctc_beam_search.h). +namespace ctc_beam_search { + +struct EmptyBeamState {}; + +struct BeamProbability { + BeamProbability() : total(kLogZero), blank(kLogZero), label(kLogZero) {} + void Reset() { + total = kLogZero; + blank = kLogZero; + label = kLogZero; + } + float total; + float blank; + float label; +}; + +template +class BeamRoot; + +template +struct BeamEntry { + // BeamRoot::AddEntry() serves as the factory method. + friend BeamEntry* BeamRoot::AddEntry( + BeamEntry* p, int l); + inline bool Active() const { return newp.total != kLogZero; } + // Return the child at the given index, or construct a new one in-place if + // none was found. + BeamEntry& GetChild(int ind) { + auto entry = children.emplace(ind, nullptr); + auto& child_entry = entry.first->second; + // If this is a new child, populate the BeamEntry*. + if (entry.second) { + child_entry = beam_root->AddEntry(this, ind); + } + return *child_entry; + } + std::vector LabelSeq(bool merge_repeated) const { + std::vector labels; + int prev_label = -1; + const BeamEntry* c = this; + while (c->parent != nullptr) { // Checking c->parent to skip root leaf. + if (!merge_repeated || c->label != prev_label) { + labels.push_back(c->label); + } + prev_label = c->label; + c = c->parent; + } + std::reverse(labels.begin(), labels.end()); + return labels; + } + + BeamEntry* parent; + int label; + // All instances of child BeamEntry are owned by *beam_root. + std::unordered_map*> children; + BeamProbability oldp; + BeamProbability newp; + CTCBeamState state; + + private: + // Constructor giving parent, label, and the beam_root. + // The object pointed to by p cannot be copied and should not be moved, + // otherwise parent will become invalid. + // This private constructor is only called through the factory method + // BeamRoot::AddEntry(). + BeamEntry(BeamEntry* p, int l, BeamRoot* beam_root) + : parent(p), label(l), beam_root(beam_root) {} + BeamRoot* beam_root; + + BeamEntry(const BeamEntry&) = delete; + void operator=(const BeamEntry&) = delete; +}; + +// This class owns all instances of BeamEntry. This is used to avoid recursive +// destructor call during destruction. +template +class BeamRoot { + public: + BeamRoot(BeamEntry* p, int l) { root_entry_ = AddEntry(p, l); } + BeamRoot(const BeamRoot&) = delete; + BeamRoot& operator=(const BeamRoot&) = delete; + + BeamEntry* AddEntry(BeamEntry* p, int l) { + auto* new_entry = new BeamEntry(p, l, this); + beam_entries_.emplace_back(new_entry); + return new_entry; + } + BeamEntry* RootEntry() const { return root_entry_; } + + private: + BeamEntry* root_entry_ = nullptr; + std::vector>> beam_entries_; +}; + +// BeamComparer is the default beam comparer provided in CTCBeamSearch. +template +class BeamComparer { + public: + virtual ~BeamComparer() {} + virtual bool inline operator()(const BeamEntry* a, + const BeamEntry* b) const { + return a->newp.total > b->newp.total; + } +}; + +} // namespace ctc_beam_search + +} // namespace ctc +} // namespace experimental +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_ diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h new file mode 100644 index 0000000000000000000000000000000000000000..ec60e26257b0f4126e7a7abed6a663abe277ef12 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h @@ -0,0 +1,79 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Collection of scoring classes that can be extended and provided to the +// CTCBeamSearchDecoder to incorporate additional scoring logic (such as a +// language model). +// +// To build a custom scorer extend and implement the pure virtual methods from +// BeamScorerInterface. The default CTC decoding behavior is implemented +// through BaseBeamScorer. + +// Copied from tensorflow/core/util/ctc/ctc_beam_scorer.h +// TODO(b/111524997): Remove this file. +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_ + +#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h" + +namespace tflite { +namespace experimental { +namespace ctc { + +// Base implementation of a beam scorer used by default by the decoder that can +// be subclassed and provided as an argument to CTCBeamSearchDecoder, if complex +// scoring is required. Its main purpose is to provide a thin layer for +// integrating language model scoring easily. +template +class BaseBeamScorer { + public: + virtual ~BaseBeamScorer() {} + // State initialization. + virtual void InitializeState(CTCBeamState* root) const {} + // ExpandState is called when expanding a beam to one of its children. + // Called at most once per child beam. In the simplest case, no state + // expansion is done. + virtual void ExpandState(const CTCBeamState& from_state, int from_label, + CTCBeamState* to_state, int to_label) const {} + // ExpandStateEnd is called after decoding has finished. Its purpose is to + // allow a final scoring of the beam in its current state, before resorting + // and retrieving the TopN requested candidates. Called at most once per beam. + virtual void ExpandStateEnd(CTCBeamState* state) const {} + // GetStateExpansionScore should be an inexpensive method to retrieve the + // (cached) expansion score computed within ExpandState. The score is + // multiplied (log-addition) with the input score at the current step from + // the network. + // + // The score returned should be a log-probability. In the simplest case, as + // there's no state expansion logic, the expansion score is zero. + virtual float GetStateExpansionScore(const CTCBeamState& state, + float previous_score) const { + return previous_score; + } + // GetStateEndExpansionScore should be an inexpensive method to retrieve the + // (cached) expansion score computed within ExpandStateEnd. The score is + // multiplied (log-addition) with the final probability of the beam. + // + // The score returned should be a log-probability. + virtual float GetStateEndExpansionScore(const CTCBeamState& state) const { + return 0; + } +}; + +} // namespace ctc +} // namespace experimental +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_ diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h new file mode 100644 index 0000000000000000000000000000000000000000..c658e43092519ba29d880a670a890af148230091 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h @@ -0,0 +1,420 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Copied from tensorflow/core/util/ctc/ctc_beam_search.h +// TODO(b/111524997): Remove this file. +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_ + +#include +#include +#include +#include +#include + +#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h" +#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h" +#include "tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h" +#include "tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h" +#include "tensorflow/contrib/lite/experimental/kernels/top_n.h" +#include "tensorflow/contrib/lite/kernels/internal/compatibility.h" + +namespace tflite { +namespace experimental { +namespace ctc { + +template > +class CTCBeamSearchDecoder : public CTCDecoder { + // Beam Search + // + // Example (GravesTh Fig. 7.5): + // a - + // P = [ 0.3 0.7 ] t = 0 + // [ 0.4 0.6 ] t = 1 + // + // Then P(l = -) = P(--) = 0.7 * 0.6 = 0.42 + // P(l = a) = P(a-) + P(aa) + P(-a) = 0.3*0.4 + ... = 0.58 + // + // In this case, Best Path decoding is suboptimal. + // + // For Beam Search, we use the following main recurrence relations: + // + // Relation 1: + // ---------------------------------------------------------- Eq. 1 + // P(l=abcd @ t=7) = P(l=abc @ t=6) * P(d @ 7) + // + P(l=abcd @ t=6) * (P(d @ 7) + P(- @ 7)) + // where P(l=? @ t=7), ? = a, ab, abc, abcd are all stored and + // updated recursively in the beam entry. + // + // Relation 2: + // ---------------------------------------------------------- Eq. 2 + // P(l=abc? @ t=3) = P(l=abc @ t=2) * P(? @ 3) + // for ? in a, b, d, ..., (not including c or the blank index), + // and the recurrence starts from the beam entry for P(l=abc @ t=2). + // + // For this case, the length of the new sequence equals t+1 (t + // starts at 0). This special case can be calculated as: + // P(l=abc? @ t=3) = P(a @ 0)*P(b @ 1)*P(c @ 2)*P(? @ 3) + // but we calculate it recursively for speed purposes. + typedef ctc_beam_search::BeamEntry BeamEntry; + typedef ctc_beam_search::BeamRoot BeamRoot; + typedef ctc_beam_search::BeamProbability BeamProbability; + + public: + typedef BaseBeamScorer DefaultBeamScorer; + + // The beam search decoder is constructed specifying the beam_width (number of + // candidates to keep at each decoding timestep) and a beam scorer (used for + // custom scoring, for example enabling the use of a language model). + // The ownership of the scorer remains with the caller. The default + // implementation, CTCBeamSearchDecoder<>::DefaultBeamScorer, generates the + // standard beam search. + CTCBeamSearchDecoder(int num_classes, int beam_width, + BaseBeamScorer* scorer, int batch_size = 1, + bool merge_repeated = false) + : CTCDecoder(num_classes, batch_size, merge_repeated), + beam_width_(beam_width), + leaves_(beam_width), + beam_scorer_(scorer) { + Reset(); + } + + ~CTCBeamSearchDecoder() override {} + + // Run the hibernating beam search algorithm on the given input. + bool Decode(const CTCDecoder::SequenceLength& seq_len, + const std::vector& input, + std::vector* output, + CTCDecoder::ScoreOutput* scores) override; + + // Calculate the next step of the beam search and update the internal state. + template + void Step(const Vector& log_input_t); + + template + float GetTopK(const int K, const Vector& input, + std::vector* top_k_logits, + std::vector* top_k_indices); + + // Retrieve the beam scorer instance used during decoding. + BaseBeamScorer* GetBeamScorer() const { return beam_scorer_; } + + // Set label selection parameters for faster decoding. + // See comments for label_selection_size_ and label_selection_margin_. + void SetLabelSelectionParameters(int label_selection_size, + float label_selection_margin) { + label_selection_size_ = label_selection_size; + label_selection_margin_ = label_selection_margin; + } + + // Reset the beam search + void Reset(); + + // Extract the top n paths at current time step + bool TopPaths(int n, std::vector>* paths, + std::vector* log_probs, bool merge_repeated) const; + + private: + int beam_width_; + + // Label selection is designed to avoid possibly very expensive scorer calls, + // by pruning the hypotheses based on the input alone. + // Label selection size controls how many items in each beam are passed + // through to the beam scorer. Only items with top N input scores are + // considered. + // Label selection margin controls the difference between minimal input score + // (versus the best scoring label) for an item to be passed to the beam + // scorer. This margin is expressed in terms of log-probability. + // Default is to do no label selection. + // For more detail: https://research.google.com/pubs/pub44823.html + int label_selection_size_ = 0; // zero means unlimited + float label_selection_margin_ = -1; // -1 means unlimited. + + gtl::TopN leaves_; + std::unique_ptr beam_root_; + BaseBeamScorer* beam_scorer_; + + CTCBeamSearchDecoder(const CTCBeamSearchDecoder&) = delete; + void operator=(const CTCBeamSearchDecoder&) = delete; +}; + +template +bool CTCBeamSearchDecoder::Decode( + const CTCDecoder::SequenceLength& seq_len, + const std::vector& input, + std::vector* output, ScoreOutput* scores) { + // Storage for top paths. + std::vector> beams; + std::vector beam_log_probabilities; + int top_n = output->size(); + if (std::any_of(output->begin(), output->end(), + [this](const CTCDecoder::Output& output) -> bool { + return output.size() < this->batch_size_; + })) { + return false; + } + if (scores->rows() < batch_size_ || scores->cols() < top_n) { + return false; + } + + for (int b = 0; b < batch_size_; ++b) { + int seq_len_b = seq_len[b]; + Reset(); + + for (int t = 0; t < seq_len_b; ++t) { + // Pass log-probabilities for this example + time. + Step(input[t].row(b)); + } // for (int t... + + // O(n * log(n)) + std::unique_ptr> branches(leaves_.Extract()); + leaves_.Reset(); + for (int i = 0; i < branches->size(); ++i) { + BeamEntry* entry = (*branches)[i]; + beam_scorer_->ExpandStateEnd(&entry->state); + entry->newp.total += + beam_scorer_->GetStateEndExpansionScore(entry->state); + leaves_.push(entry); + } + + bool status = + TopPaths(top_n, &beams, &beam_log_probabilities, merge_repeated_); + if (!status) { + return status; + } + + TFLITE_DCHECK_EQ(top_n, beam_log_probabilities.size()); + TFLITE_DCHECK_EQ(beams.size(), beam_log_probabilities.size()); + + for (int i = 0; i < top_n; ++i) { + // Copy output to the correct beam + batch + (*output)[i][b].swap(beams[i]); + (*scores)(b, i) = -beam_log_probabilities[i]; + } + } // for (int b... + return true; +} + +template +template +float CTCBeamSearchDecoder::GetTopK( + const int K, const Vector& input, std::vector* top_k_logits, + std::vector* top_k_indices) { + // Find Top K choices, complexity nk in worst case. The array input is read + // just once. + TFLITE_DCHECK_EQ(num_classes_, input.size()); + top_k_logits->clear(); + top_k_indices->clear(); + top_k_logits->resize(K, -INFINITY); + top_k_indices->resize(K, -1); + for (int j = 0; j < num_classes_ - 1; ++j) { + const float logit = input(j); + if (logit > (*top_k_logits)[K - 1]) { + int k = K - 1; + while (k > 0 && logit > (*top_k_logits)[k - 1]) { + (*top_k_logits)[k] = (*top_k_logits)[k - 1]; + (*top_k_indices)[k] = (*top_k_indices)[k - 1]; + k--; + } + (*top_k_logits)[k] = logit; + (*top_k_indices)[k] = j; + } + } + // Return max value which is in 0th index or blank character logit + return std::max((*top_k_logits)[0], input(num_classes_ - 1)); +} + +template +template +void CTCBeamSearchDecoder::Step( + const Vector& raw_input) { + std::vector top_k_logits; + std::vector top_k_indices; + const bool top_k = + (label_selection_size_ > 0 && label_selection_size_ < raw_input.size()); + // Number of character classes to consider in each step. + const int max_classes = top_k ? label_selection_size_ : (num_classes_ - 1); + // Get max coefficient and remove it from raw_input later. + float max_coeff; + if (top_k) { + max_coeff = GetTopK(label_selection_size_, raw_input, &top_k_logits, + &top_k_indices); + } else { + max_coeff = raw_input.maxCoeff(); + } + const float label_selection_input_min = + (label_selection_margin_ >= 0) ? (max_coeff - label_selection_margin_) + : -std::numeric_limits::infinity(); + + // Extract the beams sorted in decreasing new probability + TFLITE_DCHECK_EQ(num_classes_, raw_input.size()); + + std::unique_ptr> branches(leaves_.Extract()); + leaves_.Reset(); + + for (BeamEntry* b : *branches) { + // P(.. @ t) becomes the new P(.. @ t-1) + b->oldp = b->newp; + } + + for (BeamEntry* b : *branches) { + if (b->parent != nullptr) { // if not the root + if (b->parent->Active()) { + // If last two sequence characters are identical: + // Plabel(l=acc @ t=6) = (Plabel(l=acc @ t=5) + // + Pblank(l=ac @ t=5)) + // else: + // Plabel(l=abc @ t=6) = (Plabel(l=abc @ t=5) + // + P(l=ab @ t=5)) + float previous = (b->label == b->parent->label) ? b->parent->oldp.blank + : b->parent->oldp.total; + b->newp.label = + LogSumExp(b->newp.label, + beam_scorer_->GetStateExpansionScore(b->state, previous)); + } + // Plabel(l=abc @ t=6) *= P(c @ 6) + b->newp.label += raw_input(b->label) - max_coeff; + } + // Pblank(l=abc @ t=6) = P(l=abc @ t=5) * P(- @ 6) + b->newp.blank = b->oldp.total + raw_input(blank_index_) - max_coeff; + // P(l=abc @ t=6) = Plabel(l=abc @ t=6) + Pblank(l=abc @ t=6) + b->newp.total = LogSumExp(b->newp.blank, b->newp.label); + + // Push the entry back to the top paths list. + // Note, this will always fill leaves back up in sorted order. + leaves_.push(b); + } + + // we need to resort branches in descending oldp order. + + // branches is in descending oldp order because it was + // originally in descending newp order and we copied newp to oldp. + + // Grow new leaves + for (BeamEntry* b : *branches) { + // A new leaf (represented by its BeamProbability) is a candidate + // iff its total probability is nonzero and either the beam list + // isn't full, or the lowest probability entry in the beam has a + // lower probability than the leaf. + auto is_candidate = [this](const BeamProbability& prob) { + return (prob.total > kLogZero && + (leaves_.size() < beam_width_ || + prob.total > leaves_.peek_bottom()->newp.total)); + }; + + if (!is_candidate(b->oldp)) { + continue; + } + + for (int ind = 0; ind < max_classes; ind++) { + const int label = top_k ? top_k_indices[ind] : ind; + const float logit = top_k ? top_k_logits[ind] : raw_input(ind); + // Perform label selection: if input for this label looks very + // unpromising, never evaluate it with a scorer. + if (logit < label_selection_input_min) { + continue; + } + BeamEntry& c = b->GetChild(label); + if (!c.Active()) { + // Pblank(l=abcd @ t=6) = 0 + c.newp.blank = kLogZero; + // If new child label is identical to beam label: + // Plabel(l=abcc @ t=6) = Pblank(l=abc @ t=5) * P(c @ 6) + // Otherwise: + // Plabel(l=abcd @ t=6) = P(l=abc @ t=5) * P(d @ 6) + beam_scorer_->ExpandState(b->state, b->label, &c.state, c.label); + float previous = (c.label == b->label) ? b->oldp.blank : b->oldp.total; + c.newp.label = logit - max_coeff + + beam_scorer_->GetStateExpansionScore(c.state, previous); + // P(l=abcd @ t=6) = Plabel(l=abcd @ t=6) + c.newp.total = c.newp.label; + + if (is_candidate(c.newp)) { + // Before adding the new node to the beam, check if the beam + // is already at maximum width. + if (leaves_.size() == beam_width_) { + // Bottom is no longer in the beam search. Reset + // its probability; signal it's no longer in the beam search. + BeamEntry* bottom = leaves_.peek_bottom(); + bottom->newp.Reset(); + } + leaves_.push(&c); + } else { + // Deactivate child. + c.oldp.Reset(); + c.newp.Reset(); + } + } + } + } // for (BeamEntry* b... +} + +template +void CTCBeamSearchDecoder::Reset() { + leaves_.Reset(); + + // This beam root, and all of its children, will be in memory until + // the next reset. + beam_root_.reset(new BeamRoot(nullptr, -1)); + beam_root_->RootEntry()->newp.total = 0.0; // ln(1) + beam_root_->RootEntry()->newp.blank = 0.0; // ln(1) + + // Add the root as the initial leaf. + leaves_.push(beam_root_->RootEntry()); + + // Call initialize state on the root object. + beam_scorer_->InitializeState(&beam_root_->RootEntry()->state); +} + +template +bool CTCBeamSearchDecoder::TopPaths( + int n, std::vector>* paths, std::vector* log_probs, + bool merge_repeated) const { + TFLITE_DCHECK(paths); + TFLITE_DCHECK(log_probs); + paths->clear(); + log_probs->clear(); + if (n > beam_width_) { + return false; + } + if (n > leaves_.size()) { + return false; + } + + gtl::TopN top_branches(n); + + // O(beam_width_ * log(n)), space complexity is O(n) + for (auto it = leaves_.unsorted_begin(); it != leaves_.unsorted_end(); ++it) { + top_branches.push(*it); + } + // O(n * log(n)) + std::unique_ptr> branches(top_branches.Extract()); + + for (int i = 0; i < n; ++i) { + BeamEntry* e((*branches)[i]); + paths->push_back(e->LabelSeq(merge_repeated)); + log_probs->push_back(e->newp.total); + } + return true; +} + +} // namespace ctc +} // namespace experimental +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_ diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc new file mode 100644 index 0000000000000000000000000000000000000000..834d1ebd666db2be46394166edadf2a166d958aa --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc @@ -0,0 +1,247 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include "flatbuffers/flexbuffers.h" +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h" +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/contrib/lite/kernels/internal/tensor.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace experimental { +namespace ctc_beam_search_decoder { + +constexpr int kInputsTensor = 0; +constexpr int kSequenceLengthTensor = 1; + +typedef struct { + int beam_width; + int top_paths; + bool merge_repeated; +} CTCBeamSearchDecoderParams; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_CHECK(buffer != nullptr); + const uint8_t* buffer_t = reinterpret_cast(buffer); + const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); + + CTCBeamSearchDecoderParams* option = new CTCBeamSearchDecoderParams; + option->beam_width = m["beam_width"].AsInt32(); + option->top_paths = m["top_paths"].AsInt32(); + option->merge_repeated = m["merge_repeated"].AsBool(); + + return option; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const CTCBeamSearchDecoderParams* option = + reinterpret_cast(node->user_data); + const int top_paths = option->top_paths; + TF_LITE_ENSURE(context, option->beam_width >= top_paths); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + // The outputs should be top_paths * 3 + 1. + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 3 * top_paths + 1); + + const TfLiteTensor* inputs = GetInput(context, node, kInputsTensor); + TF_LITE_ENSURE_EQ(context, NumDimensions(inputs), 3); + // TensorFlow only supports float. + TF_LITE_ENSURE_EQ(context, inputs->type, kTfLiteFloat32); + const int batch_size = SizeOfDimension(inputs, 1); + + const TfLiteTensor* sequence_length = + GetInput(context, node, kSequenceLengthTensor); + TF_LITE_ENSURE_EQ(context, NumDimensions(sequence_length), 1); + TF_LITE_ENSURE_EQ(context, NumElements(sequence_length), batch_size); + // TensorFlow only supports int32. + TF_LITE_ENSURE_EQ(context, sequence_length->type, kTfLiteInt32); + + // Resize decoded outputs. + // Do not resize indices & values cause we don't know the values yet. + for (int i = 0; i < top_paths; ++i) { + TfLiteTensor* indices = GetOutput(context, node, i); + SetTensorToDynamic(indices); + TfLiteTensor* values = GetOutput(context, node, i + top_paths); + SetTensorToDynamic(values); + TfLiteTensor* output_shape = GetOutput(context, node, i + 2 * top_paths); + SetTensorToDynamic(output_shape); + } + + // Resize log probability outputs. + TfLiteTensor* log_probability_output = + GetOutput(context, node, top_paths * 3); + TfLiteIntArray* log_probability_output_shape_array = TfLiteIntArrayCreate(2); + log_probability_output_shape_array->data[0] = batch_size; + log_probability_output_shape_array->data[1] = top_paths; + return context->ResizeTensor(context, log_probability_output, + log_probability_output_shape_array); +} + +TfLiteStatus Resize(TfLiteContext* context, + std::initializer_list output_shape, + TfLiteTensor* output) { + const int dimensions = output_shape.size(); + TfLiteIntArray* output_shape_array = TfLiteIntArrayCreate(dimensions); + int i = 0; + for (const int v : output_shape) { + output_shape_array->data[i++] = v; + } + return context->ResizeTensor(context, output, output_shape_array); +} + +TfLiteStatus StoreAllDecodedSequences( + TfLiteContext* context, + const std::vector>>& sequences, + TfLiteNode* node, int top_paths) { + const int32_t batch_size = sequences.size(); + std::vector num_entries(top_paths, 0); + + // Calculate num_entries per path + for (const auto& batch_s : sequences) { + TF_LITE_ENSURE_EQ(context, batch_s.size(), top_paths); + for (int p = 0; p < top_paths; ++p) { + num_entries[p] += batch_s[p].size(); + } + } + + for (int p = 0; p < top_paths; ++p) { + const int32_t p_num = num_entries[p]; + + // Resize the decoded outputs. + TfLiteTensor* indices = GetOutput(context, node, p); + TF_LITE_ENSURE_OK(context, Resize(context, {p_num, 2}, indices)); + + TfLiteTensor* values = GetOutput(context, node, p + top_paths); + TF_LITE_ENSURE_OK(context, Resize(context, {p_num}, values)); + + TfLiteTensor* decoded_shape = GetOutput(context, node, p + 2 * top_paths); + TF_LITE_ENSURE_OK(context, Resize(context, {2}, decoded_shape)); + + int32_t max_decoded = 0; + int32_t offset = 0; + + int32_t* indices_data = GetTensorData(indices); + int32_t* values_data = GetTensorData(values); + int32_t* decoded_shape_data = GetTensorData(decoded_shape); + for (int b = 0; b < batch_size; ++b) { + auto& p_batch = sequences[b][p]; + int32_t num_decoded = p_batch.size(); + max_decoded = std::max(max_decoded, num_decoded); + + std::copy_n(p_batch.begin(), num_decoded, values_data + offset); + for (int32_t t = 0; t < num_decoded; ++t, ++offset) { + indices_data[offset * 2] = b; + indices_data[offset * 2 + 1] = t; + } + } + + decoded_shape_data[0] = batch_size; + decoded_shape_data[1] = max_decoded; + } + return kTfLiteOk; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* inputs = GetInput(context, node, kInputsTensor); + const TfLiteTensor* sequence_length = + GetInput(context, node, kSequenceLengthTensor); + const CTCBeamSearchDecoderParams* option = + reinterpret_cast(node->user_data); + + const int max_time = SizeOfDimension(inputs, 0); + const int batch_size = SizeOfDimension(inputs, 1); + const int num_classes = SizeOfDimension(inputs, 2); + + const int beam_width = option->beam_width; + const int top_paths = option->top_paths; + const bool merge_repeated = option->merge_repeated; + + // Validate sequence length is less or equal than max time. + for (int i = 0; i < batch_size; ++i) { + TF_LITE_ENSURE(context, + max_time >= GetTensorData(sequence_length)[i]); + } + + // The following logic is implemented like + // tensorflow/core/kernels/ctc_decoder_ops.cc + std::vector::UnalignedConstMatrix> input_list_t; + + for (std::size_t t = 0; t < max_time; ++t) { + input_list_t.emplace_back( + GetTensorData(inputs) + t * batch_size * num_classes, batch_size, + num_classes); + } + + ::tflite::experimental::ctc::CTCBeamSearchDecoder<>::DefaultBeamScorer + beam_scorer; + ::tflite::experimental::ctc::CTCBeamSearchDecoder<> beam_search( + num_classes, beam_width, &beam_scorer, 1 /* batch_size */, + merge_repeated); + + // Allocate temporary memory for holding chip operation data. + float* input_chip_t_data = + static_cast(malloc(num_classes * sizeof(float))); + Eigen::array dims; + dims[0] = num_classes; + optimized_ops::TTypes::Flat input_chip_t(input_chip_t_data, dims); + + std::vector>> best_paths(batch_size); + std::vector log_probs; + + TfLiteTensor* log_probabilities = GetOutput(context, node, 3 * top_paths); + float* log_probabilities_output = GetTensorData(log_probabilities); + + // Assumption: the blank index is num_classes - 1 + for (int b = 0; b < batch_size; ++b) { + auto& best_paths_b = best_paths[b]; + best_paths_b.resize(top_paths); + for (int t = 0; t < GetTensorData(sequence_length)[b]; ++t) { + input_chip_t = input_list_t[t].chip(b, 0); + auto input_bi = + Eigen::Map(input_chip_t.data(), num_classes); + beam_search.Step(input_bi); + } + TF_LITE_ENSURE(context, beam_search.TopPaths(top_paths, &best_paths_b, + &log_probs, merge_repeated)); + beam_search.Reset(); + + // Fill in log_probabilities output. + for (int bp = 0; bp < top_paths; ++bp) { + log_probabilities_output[b * top_paths + bp] = log_probs[bp]; + } + } + + free(input_chip_t_data); + return StoreAllDecodedSequences(context, best_paths, node, top_paths); +} + +} // namespace ctc_beam_search_decoder + +TfLiteRegistration* Register_CTC_BEAM_SEARCH_DECODER() { + static TfLiteRegistration r = { + ctc_beam_search_decoder::Init, ctc_beam_search_decoder::Free, + ctc_beam_search_decoder::Prepare, ctc_beam_search_decoder::Eval}; + return &r; +} + +} // namespace experimental +} // namespace ops +} // namespace tflite diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d1e6a562f00905d1db7f7e055ac1c6b1cc34f9e --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc @@ -0,0 +1,238 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include + +#include +#include "flatbuffers/flexbuffers.h" +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include "tensorflow/contrib/lite/kernels/test_util.h" +#include "tensorflow/contrib/lite/model.h" + +namespace tflite { +namespace ops { +namespace experimental { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +TfLiteRegistration* Register_CTC_BEAM_SEARCH_DECODER(); + +namespace { + +using ::testing::ElementsAre; +using ::testing::ElementsAreArray; + +class CTCBeamSearchDecoderOpModel : public SingleOpModel { + public: + CTCBeamSearchDecoderOpModel(std::initializer_list input_shape, + std::initializer_list sequence_length_shape, + int beam_width, int top_paths, + bool merge_repeated) { + inputs_ = AddInput(TensorType_FLOAT32); + sequence_length_ = AddInput(TensorType_INT32); + + for (int i = 0; i < top_paths * 3; ++i) { + outputs_.push_back(AddOutput(TensorType_INT32)); + } + outputs_.push_back(AddOutput(TensorType_FLOAT32)); + + flexbuffers::Builder fbb; + fbb.Map([&]() { + fbb.Int("beam_width", beam_width); + fbb.Int("top_paths", top_paths); + fbb.Bool("merge_repeated", merge_repeated); + }); + fbb.Finish(); + SetCustomOp("CTCBeamSearchDecoder", fbb.GetBuffer(), + Register_CTC_BEAM_SEARCH_DECODER); + BuildInterpreter({input_shape, sequence_length_shape}); + } + + int inputs() { return inputs_; } + + int sequence_length() { return sequence_length_; } + + std::vector> GetDecodedOutpus() { + std::vector> outputs; + for (int i = 0; i < outputs_.size() - 1; ++i) { + outputs.push_back(ExtractVector(outputs_[i])); + } + return outputs; + } + + std::vector GetLogProbabilitiesOutput() { + return ExtractVector(outputs_[outputs_.size() - 1]); + } + + std::vector> GetOutputShapes() { + std::vector> output_shapes; + for (const int output : outputs_) { + output_shapes.push_back(GetTensorShape(output)); + } + return output_shapes; + } + + private: + int inputs_; + int sequence_length_; + std::vector outputs_; +}; + +TEST(CTCBeamSearchTest, SimpleTest) { + CTCBeamSearchDecoderOpModel m({2, 1, 2}, {1}, 1, 1, true); + m.PopulateTensor(m.inputs(), + {-0.50922557, -1.35512652, -2.55445064, -1.58419356}); + m.PopulateTensor(m.sequence_length(), {2}); + m.Invoke(); + + // Make sure the output shapes are right. + const std::vector>& output_shapes = m.GetOutputShapes(); + EXPECT_EQ(output_shapes.size(), 4); + EXPECT_THAT(output_shapes[0], ElementsAre(1, 2)); + EXPECT_THAT(output_shapes[1], ElementsAre(1)); + EXPECT_THAT(output_shapes[2], ElementsAre(2)); + EXPECT_THAT(output_shapes[3], ElementsAre(1, 1)); + + // Check decoded outputs. + const std::vector>& decoded_outputs = m.GetDecodedOutpus(); + EXPECT_EQ(decoded_outputs.size(), 3); + EXPECT_THAT(decoded_outputs[0], ElementsAre(0, 0)); + EXPECT_THAT(decoded_outputs[1], ElementsAre(0)); + EXPECT_THAT(decoded_outputs[2], ElementsAre(1, 1)); + // Check log probabilities output. + EXPECT_THAT(m.GetLogProbabilitiesOutput(), + ElementsAreArray(ArrayFloatNear({0.32134813}))); +} + +TEST(CTCBeamSearchTest, MultiBatchTest) { + CTCBeamSearchDecoderOpModel m({3, 3, 3}, {3}, 1, 1, true); + m.PopulateTensor( + m.inputs(), + {-0.63649208, -0.00487571, -0.04249819, -0.67754697, -1.0341399, + -2.14717721, -0.77686821, -3.41973774, -0.05151402, -0.21482619, + -0.57411168, -1.45039917, -0.73769373, -2.10941739, -0.44818325, + -0.25287673, -2.80057302, -0.54748312, -0.73334867, -0.86537719, + -0.2065197, -0.18725838, -1.42770405, -0.86051965, -1.61642301, + -2.07275114, -0.9201845}); + m.PopulateTensor(m.sequence_length(), {3, 3, 3}); + m.Invoke(); + + // Make sure the output shapes are right. + const std::vector>& output_shapes = m.GetOutputShapes(); + EXPECT_EQ(output_shapes.size(), 4); + EXPECT_THAT(output_shapes[0], ElementsAre(4, 2)); + EXPECT_THAT(output_shapes[1], ElementsAre(4)); + EXPECT_THAT(output_shapes[2], ElementsAre(2)); + EXPECT_THAT(output_shapes[3], ElementsAre(3, 1)); + + // Check decoded outputs. + const std::vector>& decoded_outputs = m.GetDecodedOutpus(); + EXPECT_EQ(decoded_outputs.size(), 3); + EXPECT_THAT(decoded_outputs[0], ElementsAre(0, 0, 0, 1, 1, 0, 2, 0)); + EXPECT_THAT(decoded_outputs[1], ElementsAre(1, 0, 0, 0)); + EXPECT_THAT(decoded_outputs[2], ElementsAre(3, 2)); + // Check log probabilities output. + EXPECT_THAT( + m.GetLogProbabilitiesOutput(), + ElementsAreArray(ArrayFloatNear({0.46403232, 0.49500442, 0.40443572}))); +} + +TEST(CTCBeamSearchTest, MultiPathsTest) { + CTCBeamSearchDecoderOpModel m({3, 2, 5}, {2}, 3, 2, true); + m.PopulateTensor( + m.inputs(), + {-2.206851, -0.09542714, -0.2393415, -3.81866197, -0.27241158, + -0.20371124, -0.68236623, -1.1397166, -0.17422639, -1.85224048, + -0.9406037, -0.32544678, -0.21846784, -0.38377237, -0.33498676, + -0.10139782, -0.51886883, -0.21678554, -0.15267063, -1.91164412, + -0.31328673, -0.27462716, -0.65975336, -1.53671973, -2.76554225, + -0.23920634, -1.2370502, -4.98751576, -3.12995717, -0.43129368}); + m.PopulateTensor(m.sequence_length(), {3, 3}); + m.Invoke(); + + // Make sure the output shapes are right. + const std::vector>& output_shapes = m.GetOutputShapes(); + EXPECT_EQ(output_shapes.size(), 7); + EXPECT_THAT(output_shapes[0], ElementsAre(4, 2)); + EXPECT_THAT(output_shapes[1], ElementsAre(3, 2)); + EXPECT_THAT(output_shapes[2], ElementsAre(4)); + EXPECT_THAT(output_shapes[3], ElementsAre(3)); + EXPECT_THAT(output_shapes[4], ElementsAre(2)); + EXPECT_THAT(output_shapes[5], ElementsAre(2)); + EXPECT_THAT(output_shapes[6], ElementsAre(2, 2)); + + // Check decoded outputs. + const std::vector>& decoded_outputs = m.GetDecodedOutpus(); + EXPECT_EQ(decoded_outputs.size(), 6); + EXPECT_THAT(decoded_outputs[0], ElementsAre(0, 0, 0, 1, 1, 0, 1, 1)); + EXPECT_THAT(decoded_outputs[1], ElementsAre(0, 0, 0, 1, 1, 0)); + EXPECT_THAT(decoded_outputs[2], ElementsAre(1, 2, 3, 0)); + EXPECT_THAT(decoded_outputs[3], ElementsAre(2, 1, 0)); + EXPECT_THAT(decoded_outputs[4], ElementsAre(2, 2)); + EXPECT_THAT(decoded_outputs[5], ElementsAre(2, 2)); + // Check log probabilities output. + EXPECT_THAT(m.GetLogProbabilitiesOutput(), + ElementsAreArray(ArrayFloatNear( + {0.91318405, 0.9060272, 1.0780245, 0.64358956}))); +} + +TEST(CTCBeamSearchTest, NonEqualSequencesTest) { + CTCBeamSearchDecoderOpModel m({3, 3, 4}, {3}, 3, 1, true); + m.PopulateTensor( + m.inputs(), + {-1.26658163, -0.25760023, -0.03917975, -0.63772235, -0.03794756, + -0.45063099, -0.27706473, -0.01569179, -0.59940385, -0.35700127, + -0.48920721, -1.42635476, -1.3462478, -0.02565498, -0.30179568, + -0.6491698, -0.55017719, -2.92291466, -0.92522973, -0.47592022, + -0.07099135, -0.31575624, -0.86345281, -0.36017021, -0.79208612, + -1.75306124, -0.65089224, -0.00912786, -0.42915003, -1.72606203, + -1.66337589, -0.70800793, -2.52272352, -0.67329562, -2.49145522, + -0.49786342}); + m.PopulateTensor(m.sequence_length(), {1, 2, 3}); + m.Invoke(); + + // Make sure the output shapes are right. + const std::vector>& output_shapes = m.GetOutputShapes(); + EXPECT_EQ(output_shapes.size(), 4); + EXPECT_THAT(output_shapes[0], ElementsAre(3, 2)); + EXPECT_THAT(output_shapes[1], ElementsAre(3)); + EXPECT_THAT(output_shapes[2], ElementsAre(2)); + EXPECT_THAT(output_shapes[3], ElementsAre(3, 1)); + + // Check decoded outputs. + const std::vector>& decoded_outputs = m.GetDecodedOutpus(); + EXPECT_EQ(decoded_outputs.size(), 3); + EXPECT_THAT(decoded_outputs[0], ElementsAre(0, 0, 1, 0, 2, 0)); + EXPECT_THAT(decoded_outputs[1], ElementsAre(2, 0, 1)); + EXPECT_THAT(decoded_outputs[2], ElementsAre(3, 1)); + // Check log probabilities output. + EXPECT_THAT(m.GetLogProbabilitiesOutput(), + ElementsAreArray(ArrayFloatNear({0., 1.0347567, 0.7833005}))); +} + +} // namespace +} // namespace experimental +} // namespace ops +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h b/tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..596ad4a5f7264ae24caa5592d10c09c256629b06 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h @@ -0,0 +1,114 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Copied from tensorflow/core/util/ctc/ctc_decoder.h +// TODO(b/111524997): Remove this file. +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_ + +#include +#include + +#include "third_party/eigen3/Eigen/Core" + +namespace tflite { +namespace experimental { +namespace ctc { + +// The CTCDecoder is an abstract interface to be implemented when providing a +// decoding method on the timestep output of a RNN trained with CTC loss. +// +// The two types of decoding available are: +// - greedy path, through the CTCGreedyDecoder +// - beam search, through the CTCBeamSearchDecoder +class CTCDecoder { + public: + typedef Eigen::Map SequenceLength; + typedef Eigen::Map Input; + typedef std::vector> Output; + typedef Eigen::Map ScoreOutput; + + CTCDecoder(int num_classes, int batch_size, bool merge_repeated) + : num_classes_(num_classes), + blank_index_(num_classes - 1), + batch_size_(batch_size), + merge_repeated_(merge_repeated) {} + + virtual ~CTCDecoder() {} + + // Dimensionality of the input/output is expected to be: + // - seq_len[b] - b = 0 to batch_size_ + // - input[t].rows(b) - t = 0 to timesteps; b = 0 t batch_size_ + // - output.size() specifies the number of beams to be returned. + // - scores(b, i) - b = 0 to batch_size; i = 0 to output.size() + virtual bool Decode(const SequenceLength& seq_len, + const std::vector& input, + std::vector* output, ScoreOutput* scores) = 0; + + int batch_size() { return batch_size_; } + int num_classes() { return num_classes_; } + + protected: + int num_classes_; + int blank_index_; + int batch_size_; + bool merge_repeated_; +}; + +// CTCGreedyDecoder is an implementation of the simple best path decoding +// algorithm, selecting at each timestep the most likely class at each timestep. +class CTCGreedyDecoder : public CTCDecoder { + public: + CTCGreedyDecoder(int num_classes, int batch_size, bool merge_repeated) + : CTCDecoder(num_classes, batch_size, merge_repeated) {} + + bool Decode(const CTCDecoder::SequenceLength& seq_len, + const std::vector& input, + std::vector* output, + CTCDecoder::ScoreOutput* scores) override { + if (output->empty() || (*output)[0].size() < batch_size_) { + return false; + } + if (scores->rows() < batch_size_ || scores->cols() == 0) { + return false; + } + // For each batch entry, identify the transitions + for (int b = 0; b < batch_size_; ++b) { + int seq_len_b = seq_len[b]; + // Only writing to beam 0 + std::vector& output_b = (*output)[0][b]; + + int prev_class_ix = -1; + (*scores)(b, 0) = 0; + for (int t = 0; t < seq_len_b; ++t) { + auto row = input[t].row(b); + int max_class_ix; + (*scores)(b, 0) += -row.maxCoeff(&max_class_ix); + if (max_class_ix != blank_index_ && + !(merge_repeated_ && max_class_ix == prev_class_ix)) { + output_b.push_back(max_class_ix); + } + prev_class_ix = max_class_ix; + } + } + return true; + } +}; + +} // namespace ctc +} // namespace experimental +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_ diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h b/tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h new file mode 100644 index 0000000000000000000000000000000000000000..0bae732533716ac047a55ea31633c8ed51253fe0 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Copied from tensorflow/core/util/ctc/ctc_loss_util.h +// TODO(b/111524997): Remove this file. +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_ + +#include +#include + +namespace tflite { +namespace experimental { +namespace ctc { + +const float kLogZero = -std::numeric_limits::infinity(); + +// Add logarithmic probabilities using: +// ln(a + b) = ln(a) + ln(1 + exp(ln(b) - ln(a))) +// The two inputs are assumed to be log probabilities. +// (GravesTh) Eq. 7.18 +inline float LogSumExp(float log_prob_1, float log_prob_2) { + // Always have 'b' be the smaller number to avoid the exponential from + // blowing up. + if (log_prob_1 == kLogZero && log_prob_2 == kLogZero) { + return kLogZero; + } else { + return (log_prob_1 > log_prob_2) + ? log_prob_1 + log1pf(expf(log_prob_2 - log_prob_1)) + : log_prob_2 + log1pf(expf(log_prob_1 - log_prob_2)); + } +} + +} // namespace ctc +} // namespace experimental +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_ diff --git a/tensorflow/contrib/lite/experimental/kernels/top_n.h b/tensorflow/contrib/lite/experimental/kernels/top_n.h new file mode 100644 index 0000000000000000000000000000000000000000..cd2a2f1c80276d4659ccd2f8f05af3af030acb90 --- /dev/null +++ b/tensorflow/contrib/lite/experimental/kernels/top_n.h @@ -0,0 +1,341 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This simple class finds the top n elements of an incrementally provided set +// of elements which you push one at a time. If the number of elements exceeds +// n, the lowest elements are incrementally dropped. At the end you get +// a vector of the top elements sorted in descending order (through Extract() or +// ExtractNondestructive()), or a vector of the top elements but not sorted +// (through ExtractUnsorted() or ExtractUnsortedNondestructive()). +// +// The value n is specified in the constructor. If there are p elements pushed +// altogether: +// The total storage requirements are O(min(n, p)) elements +// The running time is O(p * log(min(n, p))) comparisons +// If n is a constant, the total storage required is a constant and the running +// time is linear in p. +// +// NOTE(zhifengc): There is a way to do this in O(min(n, p)) storage and O(p) +// runtime. The basic idea is to repeatedly fill up a buffer of 2 * n elements, +// discarding the lowest n elements whenever the buffer is full using a linear- +// time median algorithm. This may have better performance when the input +// sequence is partially sorted. +// +// NOTE(zhifengc): This class should be redesigned to avoid reallocating a +// vector for each Extract. + +// Copied from tensorflow/core/lib/gtl/top_n.h +// TODO(b/111524997): Remove this file. +#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_ +#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/kernels/internal/compatibility.h" + +namespace tflite { +namespace gtl { + +// Cmp is an stl binary predicate. Note that Cmp is the "greater" predicate, +// not the more commonly used "less" predicate. +// +// If you use a "less" predicate here, the TopN will pick out the bottom N +// elements out of the ones passed to it, and it will return them sorted in +// ascending order. +// +// TopN is rule-of-zero copyable and movable if its members are. +template > +class TopN { + public: + // The TopN is in one of the three states: + // + // o UNORDERED: this is the state an instance is originally in, + // where the elements are completely orderless. + // + // o BOTTOM_KNOWN: in this state, we keep the invariant that there + // is at least one element in it, and the lowest element is at + // position 0. The elements in other positions remain + // unsorted. This state is reached if the state was originally + // UNORDERED and a peek_bottom() function call is invoked. + // + // o HEAP_SORTED: in this state, the array is kept as a heap and + // there are exactly (limit_+1) elements in the array. This + // state is reached when at least (limit_+1) elements are + // pushed in. + // + // The state transition graph is at follows: + // + // peek_bottom() (limit_+1) elements + // UNORDERED --------------> BOTTOM_KNOWN --------------------> HEAP_SORTED + // | ^ + // | (limit_+1) elements | + // +-----------------------------------------------------------+ + + enum State { UNORDERED, BOTTOM_KNOWN, HEAP_SORTED }; + using UnsortedIterator = typename std::vector::const_iterator; + + // 'limit' is the maximum number of top results to return. + explicit TopN(size_t limit) : TopN(limit, Cmp()) {} + TopN(size_t limit, const Cmp &cmp) : limit_(limit), cmp_(cmp) {} + + size_t limit() const { return limit_; } + + // Number of elements currently held by this TopN object. This + // will be no greater than 'limit' passed to the constructor. + size_t size() const { return std::min(elements_.size(), limit_); } + + bool empty() const { return size() == 0; } + + // If you know how many elements you will push at the time you create the + // TopN object, you can call reserve to preallocate the memory that TopN + // will need to process all 'n' pushes. Calling this method is optional. + void reserve(size_t n) { elements_.reserve(std::min(n, limit_ + 1)); } + + // Push 'v'. If the maximum number of elements was exceeded, drop the + // lowest element and return it in 'dropped' (if given). If the maximum is not + // exceeded, 'dropped' will remain unchanged. 'dropped' may be omitted or + // nullptr, in which case it is not filled in. + // Requires: T is CopyAssignable, Swappable + void push(const T &v) { push(v, nullptr); } + void push(const T &v, T *dropped) { PushInternal(v, dropped); } + + // Move overloads of push. + // Requires: T is MoveAssignable, Swappable + void push(T &&v) { // NOLINT(build/c++11) + push(std::move(v), nullptr); + } + void push(T &&v, T *dropped) { // NOLINT(build/c++11) + PushInternal(std::move(v), dropped); + } + + // Peeks the bottom result without calling Extract() + const T &peek_bottom(); + + // Extract the elements as a vector sorted in descending order. The caller + // assumes ownership of the vector and must delete it when done. This is a + // destructive operation. The only method that can be called immediately + // after Extract() is Reset(). + std::vector *Extract(); + + // Similar to Extract(), but makes no guarantees the elements are in sorted + // order. As with Extract(), the caller assumes ownership of the vector and + // must delete it when done. This is a destructive operation. The only + // method that can be called immediately after ExtractUnsorted() is Reset(). + std::vector *ExtractUnsorted(); + + // A non-destructive version of Extract(). Copy the elements in a new vector + // sorted in descending order and return it. The caller assumes ownership of + // the new vector and must delete it when done. After calling + // ExtractNondestructive(), the caller can continue to push() new elements. + std::vector *ExtractNondestructive() const; + + // A non-destructive version of Extract(). Copy the elements to a given + // vector sorted in descending order. After calling + // ExtractNondestructive(), the caller can continue to push() new elements. + // Note: + // 1. The given argument must to be allocated. + // 2. Any data contained in the vector prior to the call will be deleted + // from it. After the call the vector will contain only the elements + // from the data structure. + void ExtractNondestructive(std::vector *output) const; + + // A non-destructive version of ExtractUnsorted(). Copy the elements in a new + // vector and return it, with no guarantees the elements are in sorted order. + // The caller assumes ownership of the new vector and must delete it when + // done. After calling ExtractUnsortedNondestructive(), the caller can + // continue to push() new elements. + std::vector *ExtractUnsortedNondestructive() const; + + // A non-destructive version of ExtractUnsorted(). Copy the elements into + // a given vector, with no guarantees the elements are in sorted order. + // After calling ExtractUnsortedNondestructive(), the caller can continue + // to push() new elements. + // Note: + // 1. The given argument must to be allocated. + // 2. Any data contained in the vector prior to the call will be deleted + // from it. After the call the vector will contain only the elements + // from the data structure. + void ExtractUnsortedNondestructive(std::vector *output) const; + + // Return an iterator to the beginning (end) of the container, + // with no guarantees about the order of iteration. These iterators are + // invalidated by mutation of the data structure. + UnsortedIterator unsorted_begin() const { return elements_.begin(); } + UnsortedIterator unsorted_end() const { return elements_.begin() + size(); } + + // Accessor for comparator template argument. + Cmp *comparator() { return &cmp_; } + + // This removes all elements. If Extract() or ExtractUnsorted() have been + // called, this will put it back in an empty but useable state. + void Reset(); + + private: + template + void PushInternal(U &&v, T *dropped); // NOLINT(build/c++11) + + // elements_ can be in one of two states: + // elements_.size() <= limit_: elements_ is an unsorted vector of elements + // pushed so far. + // elements_.size() > limit_: The last element of elements_ is unused; + // the other elements of elements_ are an stl heap whose size is exactly + // limit_. In this case elements_.size() is exactly one greater than + // limit_, but don't use "elements_.size() == limit_ + 1" to check for + // that because you'll get a false positive if limit_ == size_t(-1). + std::vector elements_; + size_t limit_; // Maximum number of elements to find + Cmp cmp_; // Greater-than comparison function + State state_ = UNORDERED; +}; + +// ---------------------------------------------------------------------- +// Implementations of non-inline functions + +template +template +void TopN::PushInternal(U &&v, T *dropped) { // NOLINT(build/c++11) + if (limit_ == 0) { + if (dropped) *dropped = std::forward(v); // NOLINT(build/c++11) + return; + } + if (state_ != HEAP_SORTED) { + elements_.push_back(std::forward(v)); // NOLINT(build/c++11) + if (state_ == UNORDERED || cmp_(elements_.back(), elements_.front())) { + // Easy case: we just pushed the new element back + } else { + // To maintain the BOTTOM_KNOWN state, we need to make sure that + // the element at position 0 is always the smallest. So we put + // the new element at position 0 and push the original bottom + // element in the back. + // Warning: this code is subtle. + using std::swap; + swap(elements_.front(), elements_.back()); + } + if (elements_.size() == limit_ + 1) { + // Transition from unsorted vector to a heap. + std::make_heap(elements_.begin(), elements_.end(), cmp_); + if (dropped) *dropped = std::move(elements_.front()); + std::pop_heap(elements_.begin(), elements_.end(), cmp_); + state_ = HEAP_SORTED; + } + } else { + // Only insert the new element if it is greater than the least element. + if (cmp_(v, elements_.front())) { + elements_.back() = std::forward(v); // NOLINT(build/c++11) + std::push_heap(elements_.begin(), elements_.end(), cmp_); + if (dropped) *dropped = std::move(elements_.front()); + std::pop_heap(elements_.begin(), elements_.end(), cmp_); + } else { + if (dropped) *dropped = std::forward(v); // NOLINT(build/c++11) + } + } +} + +template +const T &TopN::peek_bottom() { + TFLITE_DCHECK(!empty()); + if (state_ == UNORDERED) { + // We need to do a linear scan to find out the bottom element + int min_candidate = 0; + for (size_t i = 1; i < elements_.size(); ++i) { + if (cmp_(elements_[min_candidate], elements_[i])) { + min_candidate = i; + } + } + // By swapping the element at position 0 and the minimal + // element, we transition to the BOTTOM_KNOWN state + if (min_candidate != 0) { + using std::swap; + swap(elements_[0], elements_[min_candidate]); + } + state_ = BOTTOM_KNOWN; + } + return elements_.front(); +} + +template +std::vector *TopN::Extract() { + auto out = new std::vector; + out->swap(elements_); + if (state_ != HEAP_SORTED) { + std::sort(out->begin(), out->end(), cmp_); + } else { + out->pop_back(); + std::sort_heap(out->begin(), out->end(), cmp_); + } + return out; +} + +template +std::vector *TopN::ExtractUnsorted() { + auto out = new std::vector; + out->swap(elements_); + if (state_ == HEAP_SORTED) { + // Remove the limit_+1'th element. + out->pop_back(); + } + return out; +} + +template +std::vector *TopN::ExtractNondestructive() const { + auto out = new std::vector; + ExtractNondestructive(out); + return out; +} + +template +void TopN::ExtractNondestructive(std::vector *output) const { + TFLITE_DCHECK(output); + *output = elements_; + if (state_ != HEAP_SORTED) { + std::sort(output->begin(), output->end(), cmp_); + } else { + output->pop_back(); + std::sort_heap(output->begin(), output->end(), cmp_); + } +} + +template +std::vector *TopN::ExtractUnsortedNondestructive() const { + auto elements = new std::vector; + ExtractUnsortedNondestructive(elements); + return elements; +} + +template +void TopN::ExtractUnsortedNondestructive(std::vector *output) const { + TFLITE_DCHECK(output); + *output = elements_; + if (state_ == HEAP_SORTED) { + // Remove the limit_+1'th element. + output->pop_back(); + } +} + +template +void TopN::Reset() { + elements_.clear(); + state_ = UNORDERED; +} + +} // namespace gtl +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_ diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md index 3292aece0e76244a61613b514457edf479858fdb..4ceb9a53dc0967ab6320a1bfdb1ddb859482c5dd 100644 --- a/tensorflow/contrib/lite/g3doc/models.md +++ b/tensorflow/contrib/lite/g3doc/models.md @@ -42,22 +42,22 @@ single thread large core. Model Name | Paper_Model_Files | Model_Size | Top-1 Accuracy | Top-5 Accuracy | TF Lite Performance ------------------------ | :-------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------: -Mobilenet_0.25_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.25_128_quant.tgz) | 0.5 Mb | 39.7% | 65.8% | 3.7 ms -Mobilenet_0.25_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.25_160_quant.tgz) | 0.5 Mb | 41.9% | 69.1% | 5.5 ms -Mobilenet_0.25_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.25_192_quant.tgz) | 0.5 Mb | 45.3% | 71.9% | 7.9 ms -Mobilenet_0.25_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.25_224_quant.tgz) | 0.5 Mb | 46.4% | 73.8% | 10.4 ms -Mobilenet_0.50_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.5_128_quant.tgz) | 1.4 Mb | 54.1% | 78.9% | 8.8 ms -Mobilenet_0.50_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.5_160_quant.tgz) | 1.4 Mb | 57.6% | 81.3% | 13.0 ms -Mobilenet_0.50_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.5_192_quant.tgz) | 1.4 Mb | 59.1% | 83.2% | 18.3 ms -Mobilenet_0.50_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.5_224_quant.tgz) | 1.4 Mb | 61.0% | 84.5% | 24.7 ms -Mobilenet_0.75_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.75_128_quant.tgz) | 2.6 Mb | 52.5% | 82.8% | 16.2 ms -Mobilenet_0.75_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.75_160_quant.tgz) | 2.6 Mb | 63.6% | 85.5% | 24.3 ms -Mobilenet_0.75_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.75_192_quant.tgz) | 2.6 Mb | 61.1% | 87.1% | 33.8 ms -Mobilenet_0.75_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_0.75_224_quant.tgz) | 2.6 Mb | 66.7% | 88.1% | 45.4 ms -Mobilenet_1.0_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_1.0_128_quant.tgz) | 4.3 Mb | 62.7% | 85.5% | 24.9 ms -Mobilenet_1.0_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_1.0_160_quant.tgz) | 4.3 Mb | 66.6% | 87.7% | 37.4 ms -Mobilenet_1.0_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_1.0_192_quant.tgz) | 4.3 Mb | 69.2% | 88.9% | 51.9 ms -Mobilenet_1.0_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_07_12/mobilenet_v1_1.0_224_quant.tgz) | 4.3 Mb | 69.3% | 89.5% | 70.2 ms +Mobilenet_0.25_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_128_quant.tgz) | 0.5 Mb | 39.5% | 64.4% | 3.7 ms +Mobilenet_0.25_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_160_quant.tgz) | 0.5 Mb | 43.4% | 68.5% | 5.5 ms +Mobilenet_0.25_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_192_quant.tgz) | 0.5 Mb | 46.0% | 71.2% | 7.9 ms +Mobilenet_0.25_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_224_quant.tgz) | 0.5 Mb | 48.0% | 72.8% | 10.4 ms +Mobilenet_0.50_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_128_quant.tgz) | 1.4 Mb | 54.5% | 77.7% | 8.8 ms +Mobilenet_0.50_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_160_quant.tgz) | 1.4 Mb | 57.7% | 80.4% | 13.0 ms +Mobilenet_0.50_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_192_quant.tgz) | 1.4 Mb | 60.0% | 82.2% | 18.3 ms +Mobilenet_0.50_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_224_quant.tgz) | 1.4 Mb | 60.7% | 83.2% | 24.7 ms +Mobilenet_0.75_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_128_quant.tgz) | 2.6 Mb | 55.8% | 78.8% | 16.2 ms +Mobilenet_0.75_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_160_quant.tgz) | 2.6 Mb | 62.3% | 83.8% | 24.3 ms +Mobilenet_0.75_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_192_quant.tgz) | 2.6 Mb | 66.1% | 86.4% | 33.8 ms +Mobilenet_0.75_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_224_quant.tgz) | 2.6 Mb | 66.8% | 87.0% | 45.4 ms +Mobilenet_1.0_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_128_quant.tgz) | 4.3 Mb | 63.4% | 84.2% | 24.9 ms +Mobilenet_1.0_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz) | 4.3 Mb | 67.2% | 86.7% | 37.4 ms +Mobilenet_1.0_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz) | 4.3 Mb | 69.2% | 88.3% | 51.9 ms +Mobilenet_1.0_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz) | 4.3 Mb | 70.1% | 88.9% | 70.2 ms ## Other models diff --git a/tensorflow/contrib/lite/g3doc/performance.md b/tensorflow/contrib/lite/g3doc/performance.md index 613e9f97c38942f20d3ca44cdc69e72b35c8608f..5cd0aab44f10de1b76e1acb302fc1ee2711c8d74 100644 --- a/tensorflow/contrib/lite/g3doc/performance.md +++ b/tensorflow/contrib/lite/g3doc/performance.md @@ -39,7 +39,6 @@ Device | CPU_MASK | Pixel 2 | f0 | Pixel xl | 0c | - @@ -50,7 +49,7 @@ Pixel xl | 0c | @@ -61,7 +60,7 @@ Pixel xl | 0c | @@ -134,14 +133,14 @@ modified to set `num_threads` to 1. diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md index cdc9172d873bfd32811ca69901ed2e4eedf902a3..9fcf79ba004d85566b64ce35b3693e01c4b0e2cf 100644 --- a/tensorflow/contrib/lite/g3doc/rpi.md +++ b/tensorflow/contrib/lite/g3doc/rpi.md @@ -20,7 +20,7 @@ Clone this Tensorflow repository, Run this script at the root of the repository ```bash ./tensorflow/contrib/lite/download_dependencies.sh ``` -Note than you only need to to this once. +Note that you only need to do this once. You should then be able to compile: ```bash @@ -42,7 +42,7 @@ First, clone this TensorFlow repository. Run this at the root of the repository: ```bash ./tensorflow/contrib/lite/download_dependencies.sh ``` -Note than you only need to to this once. +Note that you only need to do this once. You should then be able to compile: ```bash diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index e38597495dc7e860209026631c2d386f690b6461..7a680f5c6400a94a2746d09891e0e39a410404a2 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -26,18 +26,12 @@ limitations under the License. #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/graph_info.h" #include "tensorflow/contrib/lite/memory_planner.h" -#ifndef TFLITE_MCU #include "tensorflow/contrib/lite/nnapi_delegate.h" -#endif #include "tensorflow/contrib/lite/profiling/profiler.h" #include "tensorflow/contrib/lite/schema/schema_generated.h" #include "tensorflow/contrib/lite/util.h" namespace tflite { -#ifdef TFLITE_MCU -class NNAPIDelegate {}; -#endif - namespace { TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node, @@ -630,7 +624,6 @@ TfLiteStatus Interpreter::Invoke() { } TfLiteStatus status = kTfLiteOk; -#ifndef TFLITE_MCU if (nnapi_delegate_) { if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) { TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this)); @@ -644,7 +637,6 @@ TfLiteStatus Interpreter::Invoke() { return kTfLiteError; } } -#endif // Invocations are always done in node order. // Note that calling Invoke repeatedly will cause the original memory plan to @@ -902,17 +894,15 @@ TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor, } void Interpreter::UseNNAPI(bool enable) { -#ifndef TFLITE_MCU // TODO(aselle): This is a workaround for finding if NNAPI exists. // We also need to make sure getLibraryHandle() is renamed to be NNAPI // prefixed. - if (!NNAPIExists()) enable = false; + if (!NNAPIDelegate::IsSupported()) enable = false; if (!enable) { nnapi_delegate_.reset(); } else if (!nnapi_delegate_) { nnapi_delegate_.reset(new NNAPIDelegate); } -#endif } void Interpreter::SetNumThreads(int num_threads) { diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index be149a8cc0e642d10b270ba617cd8d6be29430b2..159ff7bc20a1e1261cdfd746312279bd59e3b1a4 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -165,7 +165,7 @@ class Interpreter { return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(), dims.data(), quantization, buffer, bytes, allocation); - }; + } TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, @@ -527,12 +527,13 @@ class Interpreter { TfLiteRegistration** registration); // WARNING: This is an experimental interface that is subject to change. - // Gets an TfLiteIntArray* representing the execution plan. The caller owns - // this memory and must free it with TfLiteIntArrayFree(). + // Gets an TfLiteIntArray* representing the execution plan. The interpreter + // owns this memory and it is only guaranteed to exist during the invocation + // of the delegate prepare. TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan); // WARNING: This is an experimental interface that is subject to change. - // Entry point for C node plugin API to get the execution plan + // Entry point for C node plugin API to get the execution plan. static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context, TfLiteIntArray** execution_plan); diff --git a/tensorflow/contrib/lite/java/demo/.gitignore b/tensorflow/contrib/lite/java/demo/.gitignore index 39fb081a42a86ccf8f9cf99dbccc8bdf7c828bce..d245ab61095a6f9b6d2077aac934f9b13e66d85e 100644 --- a/tensorflow/contrib/lite/java/demo/.gitignore +++ b/tensorflow/contrib/lite/java/demo/.gitignore @@ -1,9 +1,29 @@ +# This file is based on https://github.com/github/gitignore/blob/master/Android.gitignore *.iml +.idea/compiler.xml +.idea/copyright +.idea/dictionaries +.idea/gradle.xml +.idea/libraries +.idea/inspectionProfiles +.idea/misc.xml +.idea/modules.xml +.idea/runConfigurations.xml +.idea/tasks.xml +.idea/workspace.xml .gradle -/local.properties -/.idea/workspace.xml -/.idea/libraries +local.properties .DS_Store -/build +build/ +gradleBuild/ +*.apk +*.ap_ +*.dex +*.class +bin/ +gen/ +out/ +*.log +.navigation/ /captures .externalNativeBuild diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java index c23521c0774ebab01f38db8b416020ae5755cee9..38b740021bb5037fc8980c75ca6aac2a9cc20c4e 100644 --- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java +++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java @@ -65,6 +65,25 @@ public class TestHelper { } } + /** + * Gets the string name of the data type of an input. + * + * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code + * IllegalArgumentException} will be thrown. + * @param index an integer index of the input. If it is invalid, an {@code + * IllegalArgumentException} will be thrown. + * @return string name of the data type. Possible values include "float", "int", "byte", and + * "long". + */ + public static String getInputDataType(Interpreter interpreter, int index) { + if (interpreter != null && interpreter.wrapper != null) { + return interpreter.wrapper.getInputTensor(index).dataType().toStringName(); + } else { + throw new IllegalArgumentException( + "Interpreter has not initialized;" + " Failed to get input data type."); + } + } + /** * Gets the string name of the data type of an output. * diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 329c98f91e90134e1dff58427102776fd6b7a73b..c5586475ec258849948ff6b960abc846e2ea1b3c 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -8,6 +8,19 @@ load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts") load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +# Suppress warnings that are introduced by Eigen Tensor. +EXTRA_EIGEN_COPTS = select({ + "//tensorflow:ios": [ + "-Wno-error=invalid-partial-specialization", + "-Wno-error=reorder", + ], + "//tensorflow:windows": [ + "/DEIGEN_HAS_C99_MATH", + "/DEIGEN_AVOID_STL_ARRAY", + ], + "//conditions:default": ["-Wno-error=reorder"], +}) + tf_cc_test( name = "optional_tensor_test", size = "small", @@ -49,13 +62,7 @@ cc_library( hdrs = [ "eigen_support.h", ], - copts = tflite_copts() + [ - "-Wno-error=reorder", - ] + select({ - "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"], - "//conditions:default": [ - ], - }), + copts = tflite_copts() + EXTRA_EIGEN_COPTS, deps = [ ":op_macros", "//tensorflow/contrib/lite:arena_planner", @@ -209,14 +216,7 @@ cc_library( "padding.h", "register.h", ], - # Suppress warnings that are introduced by Eigen Tensor. - copts = tflite_copts() + [ - "-Wno-error=reorder", - ] + select({ - "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"], - "//conditions:default": [ - ], - }), + copts = tflite_copts() + EXTRA_EIGEN_COPTS, deps = [ ":activation_functor", ":eigen_support", diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc index 6e13b8c667c5c5188c9e1bc753346f231ae8e1b0..817266a47147980699a348a5c26ed637828e80c6 100644 --- a/tensorflow/contrib/lite/kernels/activations.cc +++ b/tensorflow/contrib/lite/kernels/activations.cc @@ -212,25 +212,25 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, 0); const TfLiteTensor* alpha = GetInput(context, node, 1); - output->type = input->type; - // Currently only Float32 is supported // TODO(ycling): Support other data types. TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32); + output->type = input->type; - // Currently, only support 4D `input` and 3D `alpha` with shape - // (1, 1, channels). - // TODO(impjdi): Support other cases where `alpha` is broadcastable - // to `input`. - TF_LITE_ENSURE_EQ(context, input->dims->size, 4); - TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], input->dims->data[3]); + // PRelu (parameteric Relu) shares the same alpha value on "shared axis". + // This means it's always required to "broadcast" alpha values in PRelu. + TfLiteIntArray* output_size = nullptr; + TF_LITE_ENSURE_OK( + context, CalculateShapeForBroadcast(context, input, alpha, &output_size)); - return context->ResizeTensor(context, output, - TfLiteIntArrayCopy(input->dims)); + TF_LITE_ENSURE_OK(context, + context->ResizeTensor(context, output, output_size)); + // After broadcasting, the output shape should always be the same as the + // input shape. + TF_LITE_ENSURE(context, HaveSameShapes(input, output)); + + return kTfLiteOk; } TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) { @@ -524,33 +524,24 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) { } } +template +T ApplyPrelu(T input, T alpha) { + return input >= 0.0 ? input : input * alpha; +} + TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, 0); const TfLiteTensor* alpha = GetInput(context, node, 1); - const TfLiteTensor* output = GetOutput(context, node, 0); - + TfLiteTensor* output = GetOutput(context, node, 0); if (input->type != kTfLiteFloat32) { context->ReportError(context, "Only float32 supported currently, got %d.", input->type); return kTfLiteError; } - TF_LITE_ENSURE_EQ(context, input->dims->size, 4); - const int batches = input->dims->data[0]; - const int height = input->dims->data[1]; - const int width = input->dims->data[2]; - const int channels = input->dims->data[3]; - - TF_LITE_ENSURE_EQ(context, alpha->dims->size, 3); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[0], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[1], 1); - TF_LITE_ENSURE_EQ(context, alpha->dims->data[2], channels); - - const int n = batches * height * width * channels; - for (int i = 0; i < n; ++i) { - const float x = input->data.f[i]; - output->data.f[i] = x >= 0.0f ? x : alpha->data.f[i % channels] * x; - } - + reference_ops::BroadcastBinaryFunction( + GetTensorData(input), GetTensorDims(input), + GetTensorData(alpha), GetTensorDims(alpha), + GetTensorData(output), GetTensorDims(output), ApplyPrelu); return kTfLiteOk; } diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc index f678f48fa5bbbcece6c5b87030d951783378d78f..8b4d778332afd5f4b53509bd669a674c63d9f6f9 100644 --- a/tensorflow/contrib/lite/kernels/comparisons.cc +++ b/tensorflow/contrib/lite/kernels/comparisons.cc @@ -57,6 +57,57 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) { return context->ResizeTensor(context, output, output_size); } +// TODO(ruic): optimize macros below to using template functions. +#define TF_LITE_QUANTIZE_COMPARISON(opname) \ + void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node, \ + const TfLiteTensor* input1, \ + const TfLiteTensor* input2, TfLiteTensor* output, \ + bool requires_broadcast) { \ + if (input1->type == kTfLiteUInt8) { \ + auto input1_offset = -input1->params.zero_point; \ + auto input2_offset = -input2->params.zero_point; \ + const int left_shift = 20; \ + const double twice_max_input_scale = \ + 2 * std::max(input1->params.scale, input2->params.scale); \ + const double real_input1_multiplier = \ + input1->params.scale / twice_max_input_scale; \ + const double real_input2_multiplier = \ + input2->params.scale / twice_max_input_scale; \ + \ + int32 input1_multiplier; \ + int input1_shift; \ + QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier, \ + &input1_multiplier, &input1_shift); \ + int32 input2_multiplier; \ + int input2_shift; \ + QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier, \ + &input2_multiplier, &input2_shift); \ + \ + if (requires_broadcast) { \ + reference_ops::Broadcast##opname( \ + left_shift, GetTensorData(input1), GetTensorDims(input1), \ + input1_offset, input1_multiplier, input1_shift, \ + GetTensorData(input2), GetTensorDims(input2), \ + input2_offset, input2_multiplier, input2_shift, \ + GetTensorData(output), GetTensorDims(output)); \ + } else { \ + reference_ops::opname( \ + left_shift, GetTensorData(input1), GetTensorDims(input1), \ + input1_offset, input1_multiplier, input1_shift, \ + GetTensorData(input2), GetTensorDims(input2), \ + input2_offset, input2_multiplier, input2_shift, \ + GetTensorData(output), GetTensorDims(output)); \ + } \ + } \ + } +TF_LITE_QUANTIZE_COMPARISON(Equal); +TF_LITE_QUANTIZE_COMPARISON(NotEqual); +TF_LITE_QUANTIZE_COMPARISON(Greater); +TF_LITE_QUANTIZE_COMPARISON(GreaterEqual); +TF_LITE_QUANTIZE_COMPARISON(Less); +TF_LITE_QUANTIZE_COMPARISON(LessEqual); +#undef TF_LITE_QUANTIZE_COMPARISON + #define TF_LITE_COMPARISON(type, opname, requires_broadcast) \ requires_broadcast \ ? reference_ops::Broadcast##opname( \ @@ -73,7 +124,6 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, Equal, requires_broadcast); @@ -84,9 +134,13 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedEqual(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type %d, requires float|int", + "Does not support type %d, requires float|int|uint8", input1->type); return kTfLiteError; } @@ -99,7 +153,6 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, NotEqual, requires_broadcast); @@ -110,9 +163,13 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedNotEqual(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type %d, requires float|int", + "Does not support type %d, requires float|int|uint8", input1->type); return kTfLiteError; } @@ -124,7 +181,6 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, Greater, requires_broadcast); @@ -135,9 +191,13 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedGreater(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type %d, requires float|int", + "Does not support type %d, requires float|int|uint8", input1->type); return kTfLiteError; } @@ -149,7 +209,6 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, GreaterEqual, requires_broadcast); @@ -160,9 +219,13 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedGreaterEqual(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type %d, requires float|int", + "Does not support type %d, requires float|int|uint8", input1->type); return kTfLiteError; } @@ -174,7 +237,6 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, Less, requires_broadcast); @@ -185,9 +247,13 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, Less, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedLess(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type %d, requires float|int", + "Does not support type %d, requires float|int|uint8", input1->type); return kTfLiteError; } @@ -199,7 +265,6 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); bool requires_broadcast = !HaveSameShapes(input1, input2); - // TODO(renjieliu): Support quantized data. switch (input1->type) { case kTfLiteFloat32: TF_LITE_COMPARISON(float, LessEqual, requires_broadcast); @@ -210,9 +275,13 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast); break; + case kTfLiteUInt8: + EvalQuantizedLessEqual(context, node, input1, input2, output, + requires_broadcast); + break; default: context->ReportError(context, - "Does not support type %d, requires float|int", + "Does not support type %d, requires float|int|uint8", input1->type); return kTfLiteError; } diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc index bb02e1c812fdc40bf515f1f978e9e39b5a16a4ea..67a91c17fd4f25e4a9ea22de5e2a10dc1c17656d 100644 --- a/tensorflow/contrib/lite/kernels/comparisons_test.cc +++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc @@ -35,6 +35,15 @@ class ComparisonOpModel : public SingleOpModel { BuildInterpreter({input1_shape, input2_shape}); } + ComparisonOpModel(const TensorData& input1, const TensorData& input2, + TensorType input_type, BuiltinOperator op) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(TensorType_BOOL); + ConfigureBuiltinOp(op); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + int input1() { return input1_; } int input2() { return input2_; } @@ -354,6 +363,192 @@ TEST(ComparisonsTest, LessEqualBroadcastTwoD) { EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4)); } +TEST(QuantizedComparisonsTest, EqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_EQUAL); + model.QuantizeAndPopulate(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate(model.input2(), {1, 2, 7, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, false)); +} + +TEST(QuantizedComparisonsTest, NotEqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_NOT_EQUAL); + model.QuantizeAndPopulate(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate(model.input2(), {1, 2, 7, 0}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true)); +} + +TEST(ComparisonsTest, GreaterQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER); + model.QuantizeAndPopulate(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false)); +} + +TEST(ComparisonsTest, GreaterEqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER_EQUAL); + model.QuantizeAndPopulate(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, true, false)); +} + +TEST(ComparisonsTest, LessQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS); + model.QuantizeAndPopulate(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true)); +} + +TEST(ComparisonsTest, LessEqualQuantized) { + const float kMin = -1.f; + const float kMax = 128.f; + ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS_EQUAL); + model.QuantizeAndPopulate(model.input1(), {1, 9, 7, 3}); + model.QuantizeAndPopulate(model.input2(), {1, 2, 6, 5}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); +} + +TEST(ComparisonsTest, QuantizedEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_EQUAL); + model.QuantizeAndPopulate(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate(model.input2(), {2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, false, false, false, false)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_NOT_EQUAL); + model.QuantizeAndPopulate(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate(model.input2(), {2}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, true, true, true, true)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER); + model.QuantizeAndPopulate(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, false, false, true, true)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_GREATER_EQUAL); + model.QuantizeAndPopulate(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(true, false, false, true, true, true)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedLessWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS); + model.QuantizeAndPopulate(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, true, false, false, false)) + << "With shape number " << i; + } +} + +TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) { + const float kMin = -1.f; + const float kMax = 128.f; + std::vector> test_shapes = { + {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; + for (int i = 0; i < test_shapes.size(); ++i) { + ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax}, + {TensorType_UINT8, {}, kMin, kMax}, + TensorType_UINT8, BuiltinOperator_LESS_EQUAL); + model.QuantizeAndPopulate(model.input1(), {20, 2, 7, 8, 11, 20}); + model.QuantizeAndPopulate(model.input2(), {8}); + model.Invoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAre(false, true, true, true, false, false)) + << "With shape number " << i; + } +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc index ad211e9c67eed9ca70fcdd51171fdb70bd89b27c..605a20ac3e7c8346db2bcf64e9422132b433b3da 100644 --- a/tensorflow/contrib/lite/kernels/concatenation.cc +++ b/tensorflow/contrib/lite/kernels/concatenation.cc @@ -57,7 +57,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, t0->dims->size <= 4); TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone); TF_LITE_ENSURE(context, - input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8); + input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 || + input_type == kTfLiteInt16 || input_type == kTfLiteInt32 || + input_type == kTfLiteInt64); // Output dimensions will match input dimensions, except 'axis', which // will be the sum of inputs @@ -121,6 +123,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_CONCATENATION(optimized_ops, float); } break; + case kTfLiteInt32: + if (kernel_type == kReference) { + TF_LITE_CONCATENATION(reference_ops, int32); + } else { + TF_LITE_CONCATENATION(optimized_ops, int32); + } + break; case kTfLiteUInt8: if (kernel_type == kReference) { TF_LITE_CONCATENATION_QUANTIZED(reference_ops); @@ -128,6 +137,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_CONCATENATION_QUANTIZED(optimized_ops); } break; + case kTfLiteInt64: + if (kernel_type == kReference) { + TF_LITE_CONCATENATION(reference_ops, int64_t); + } else { + TF_LITE_CONCATENATION(optimized_ops, int64_t); + } + break; + default: context->ReportError(context, "Only float32 and uint8 are currently supported."); diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc index 6f174763dfab9845d991b930e44b07a95e00d824..04c0263b789e75727ed3bd4d6b3292063a4530e0 100644 --- a/tensorflow/contrib/lite/kernels/conv.cc +++ b/tensorflow/contrib/lite/kernels/conv.cc @@ -256,10 +256,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - TF_LITE_ENSURE(context, real_multiplier < 1.0); - QuantizeMultiplierSmallerThanOneExp( - real_multiplier, &data->output_multiplier, &data->output_shift); - data->output_shift *= -1; + + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; CalculateActivationRangeUint8(params->activation, output, &data->output_activation_min, &data->output_activation_max); diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc index 0dcfc826fd218d2d2dfbf89201d2c13fbfe6f0e1..24633c2fd7cb3725977ae6c6459daa829165ccfd 100644 --- a/tensorflow/contrib/lite/kernels/conv_test.cc +++ b/tensorflow/contrib/lite/kernels/conv_test.cc @@ -64,12 +64,6 @@ class BaseConvolutionOpModel : public SingleOpModel { } output_ = AddOutput(output); - if (input.type != TensorType_FLOAT32) { - // The following is required by quantized inference. It is the unittest's - // responsibility to make sure the output scale falls into the correct - // range. - CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_)); - } SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions, CreateConv2DOptions( @@ -441,6 +435,44 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantized) { })); } +TEST_P(ConvolutionOpTest, SimpleTestQuantizedOutputMultiplierGreaterThan1) { + // output_multiplier = 1.0118 + QuantizedConvolutionOpModel quant_op( + GetRegistration(), {TensorType_UINT8, {2, 2, 4, 1}, -128.5, 128}, + {TensorType_UINT8, {3, 2, 2, 1}, -128.5, 128}, + {TensorType_UINT8, {}, -127, 128}); + ConvolutionOpModel float_op( + GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_FLOAT32, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}}); + std::initializer_list input = { + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }; + std::initializer_list filter = { + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }; + std::initializer_list bias = {1, 2, 3}; + + quant_op.SetInput(input); + quant_op.SetFilter(filter); + quant_op.SetBias(bias); + quant_op.Invoke(); + + float_op.SetInput(input); + float_op.SetFilter(filter); + float_op.SetBias(bias); + float_op.Invoke(); + + EXPECT_THAT(quant_op.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(float_op.GetOutput(), 1))); +} + TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) { QuantizedConvolutionOpModel m(GetRegistration(), {TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64}, diff --git a/tensorflow/contrib/lite/kernels/dequantize.cc b/tensorflow/contrib/lite/kernels/dequantize.cc index 672b2170e4990f0a7ca9755071d9d086f5ae5c2b..2b0f04489a48cd4402e7574ecc5eeecfd8c6234f 100644 --- a/tensorflow/contrib/lite/kernels/dequantize.cc +++ b/tensorflow/contrib/lite/kernels/dequantize.cc @@ -36,6 +36,21 @@ struct OpContext { TfLiteTensor* output; }; +struct OpData { + // This boolean value is only used when the input tensor is constant. + bool float_dequantized_weights_initialized; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + auto* op_data = new OpData(); + op_data->float_dequantized_weights_initialized = false; + return op_data; +} + +void Free(TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -45,12 +60,22 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8); op_context.output->type = kTfLiteFloat32; + // If the input tensor is constant, we can persist the dequantized value in + // the output tensor. Otherwise we run dequantize upon each eval. + if (IsConstantTensor(op_context.input)) { + op_context.output->allocation_type = kTfLiteArenaRwPersistent; + } return context->ResizeTensor(context, op_context.output, TfLiteIntArrayCopy(op_context.input->dims)); } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + OpData* op_data = reinterpret_cast(node->user_data); OpContext op_context(context, node); + if (IsConstantTensor(op_context.input) && + op_data->float_dequantized_weights_initialized) { + return kTfLiteOk; + } auto zero_point = op_context.input->params.zero_point; auto scale = op_context.input->params.scale; @@ -59,14 +84,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { GetTensorDims(op_context.input), zero_point, scale, GetTensorData(op_context.output), GetTensorDims(op_context.output)); + + if (IsConstantTensor(op_context.input)) { + op_data->float_dequantized_weights_initialized = true; + } + return kTfLiteOk; } } // namespace dequantize TfLiteRegistration* Register_DEQUANTIZE_OPT() { - static TfLiteRegistration r = {nullptr, nullptr, dequantize::Prepare, - dequantize::Eval}; + static TfLiteRegistration r = {dequantize::Init, dequantize::Free, + dequantize::Prepare, dequantize::Eval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess.cc b/tensorflow/contrib/lite/kernels/detection_postprocess.cc index 0c532cac5a9f59c8b09ff9aefc294e243561f027..d7bde0ff79bd23fa4c277dd04ec4343663e0ad00 100644 --- a/tensorflow/contrib/lite/kernels/detection_postprocess.cc +++ b/tensorflow/contrib/lite/kernels/detection_postprocess.cc @@ -40,8 +40,8 @@ constexpr int kOutputTensorDetectionClasses = 1; constexpr int kOutputTensorDetectionScores = 2; constexpr int kOutputTensorNumDetections = 3; -constexpr size_t kNumCoordBox = 4; -constexpr size_t kBatchSize = 1; +constexpr int kNumCoordBox = 4; +constexpr int kBatchSize = 1; // Object Detection model produces axis-aligned boxes in two formats: // BoxCorner represents the upper right (xmin, ymin) and diff --git a/tensorflow/contrib/lite/kernels/elementwise.cc b/tensorflow/contrib/lite/kernels/elementwise.cc index 59bab3c4ecd20bf938919ca606a5933f3112f233..e19779ea59d441984d3562508e4237e10ce17515 100644 --- a/tensorflow/contrib/lite/kernels/elementwise.cc +++ b/tensorflow/contrib/lite/kernels/elementwise.cc @@ -22,79 +22,118 @@ namespace tflite { namespace ops { namespace builtin { namespace elementwise { +namespace { +bool IsNumericSupportedType(const TfLiteType type) { + return type == kTfLiteFloat32; +} + +bool IsLogicalSupportedType(const TfLiteType type) { + return type == kTfLiteBool; +} + +typedef bool (*IsSupportedType)(TfLiteType); +template TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); TF_LITE_ENSURE_EQ(context, input->type, output->type); - // Quantized float is not supported yet. - TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + if (!IsSupportedType(input->type)) { + context->ReportError(context, "Current data type %d is not supported.", + input->type); + return kTfLiteError; + } return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); } -inline TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, - float float_func(float)) { +template +inline TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node, + T func(T), TfLiteType expected_type) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - switch (input->type) { - case kTfLiteFloat32: { - size_t elements = NumElements(input); - const float* in = GetTensorData(input); - const float* in_end = in + elements; - float* out = output->data.f; - for (; in < in_end; in++, out++) *out = float_func(*in); - return kTfLiteOk; - } - default: { - context->ReportError(context, "Input type is %d, requires float32", - input->type); - return kTfLiteError; - } + TF_LITE_ENSURE_EQ(context, input->type, expected_type); + const int64_t num_elements = NumElements(input); + const T* in_data = GetTensorData(input); + T* out_data = GetTensorData(output); + for (int64_t i = 0; i < num_elements; ++i) { + out_data[i] = func(in_data[i]); } + return kTfLiteOk; +} + +inline TfLiteStatus EvalNumeric(TfLiteContext* context, TfLiteNode* node, + float float_func(float)) { + return EvalImpl(context, node, float_func, kTfLiteFloat32); +} + +inline TfLiteStatus EvalLogical(TfLiteContext* context, TfLiteNode* node, + bool bool_func(bool)) { + return EvalImpl(context, node, bool_func, kTfLiteBool); } TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) { - return Eval(context, node, std::sin); + return EvalNumeric(context, node, std::sin); } TfLiteStatus LogEval(TfLiteContext* context, TfLiteNode* node) { - return Eval(context, node, std::log); + return EvalNumeric(context, node, std::log); } TfLiteStatus SqrtEval(TfLiteContext* context, TfLiteNode* node) { - return Eval(context, node, std::sqrt); + return EvalNumeric(context, node, std::sqrt); } TfLiteStatus RsqrtEval(TfLiteContext* context, TfLiteNode* node) { - return Eval(context, node, [](float f) { return 1.f / std::sqrt(f); }); + return EvalNumeric(context, node, [](float f) { return 1.f / std::sqrt(f); }); +} + +TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) { + return EvalLogical(context, node, [](bool v) { return !v; }); } +} // namespace } // namespace elementwise TfLiteRegistration* Register_SIN() { - static TfLiteRegistration r = {nullptr, nullptr, elementwise::GenericPrepare, - elementwise::SinEval}; + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare, + elementwise::SinEval}; return &r; } TfLiteRegistration* Register_LOG() { - static TfLiteRegistration r = {nullptr, nullptr, elementwise::GenericPrepare, - elementwise::LogEval}; + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare, + elementwise::LogEval}; return &r; } TfLiteRegistration* Register_SQRT() { - static TfLiteRegistration r = {nullptr, nullptr, elementwise::GenericPrepare, - elementwise::SqrtEval}; + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare, + elementwise::SqrtEval}; return &r; } TfLiteRegistration* Register_RSQRT() { - static TfLiteRegistration r = {nullptr, nullptr, elementwise::GenericPrepare, - elementwise::RsqrtEval}; + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare, + elementwise::RsqrtEval}; + return &r; +} + +TfLiteRegistration* Register_LOGICAL_NOT() { + static TfLiteRegistration r = { + /*init=*/nullptr, /*free=*/nullptr, + elementwise::GenericPrepare, + elementwise::LogicalNotEval}; return &r; } diff --git a/tensorflow/contrib/lite/kernels/elementwise_test.cc b/tensorflow/contrib/lite/kernels/elementwise_test.cc index ce4c602ee5c788d67701af3ecd3e023f2b25aae7..b9d7d73c52862da9166f6881b1e27a6ff6b76bbc 100644 --- a/tensorflow/contrib/lite/kernels/elementwise_test.cc +++ b/tensorflow/contrib/lite/kernels/elementwise_test.cc @@ -24,26 +24,40 @@ namespace { using ::testing::ElementsAreArray; -class ElementWiseOpModel : public SingleOpModel { +class ElementWiseOpBaseModel : public SingleOpModel { public: - ElementWiseOpModel(BuiltinOperator op, - std::initializer_list input_shape) { + int input() const { return input_; } + int output() const { return output_; } + + protected: + int input_; + int output_; +}; + +class ElementWiseOpFloatModel : public ElementWiseOpBaseModel { + public: + ElementWiseOpFloatModel(BuiltinOperator op, + std::initializer_list input_shape) { input_ = AddInput(TensorType_FLOAT32); output_ = AddOutput(TensorType_FLOAT32); SetBuiltinOp(op, BuiltinOptions_NONE, 0); BuildInterpreter({input_shape}); } +}; - int input() const { return input_; } - int output() const { return output_; } - - private: - int input_; - int output_; +class ElementWiseOpBoolModel : public ElementWiseOpBaseModel { + public: + ElementWiseOpBoolModel(BuiltinOperator op, + std::initializer_list input_shape) { + input_ = AddInput(TensorType_BOOL); + output_ = AddOutput(TensorType_BOOL); + SetBuiltinOp(op, BuiltinOptions_NONE, 0); + BuildInterpreter({input_shape}); + } }; TEST(ElementWise, Sin) { - ElementWiseOpModel m(BuiltinOperator_SIN, {1, 1, 4, 1}); + ElementWiseOpFloatModel m(BuiltinOperator_SIN, {1, 1, 4, 1}); m.PopulateTensor(m.input(), {0, 3.1415926, -3.1415926, 1}); m.Invoke(); EXPECT_THAT(m.ExtractVector(m.output()), @@ -52,7 +66,7 @@ TEST(ElementWise, Sin) { } TEST(ElementWise, Log) { - ElementWiseOpModel m(BuiltinOperator_LOG, {1, 1, 4, 1}); + ElementWiseOpFloatModel m(BuiltinOperator_LOG, {1, 1, 4, 1}); m.PopulateTensor(m.input(), {1, 3.1415926, 1, 1}); m.Invoke(); EXPECT_THAT(m.ExtractVector(m.output()), @@ -61,7 +75,7 @@ TEST(ElementWise, Log) { } TEST(ElementWise, Sqrt) { - ElementWiseOpModel m(BuiltinOperator_SQRT, {1, 1, 4, 1}); + ElementWiseOpFloatModel m(BuiltinOperator_SQRT, {1, 1, 4, 1}); m.PopulateTensor(m.input(), {0, 1, 2, 4}); m.Invoke(); EXPECT_THAT(m.ExtractVector(m.output()), @@ -70,7 +84,7 @@ TEST(ElementWise, Sqrt) { } TEST(ElementWise, Rsqrt) { - ElementWiseOpModel m(BuiltinOperator_RSQRT, {1, 1, 4, 1}); + ElementWiseOpFloatModel m(BuiltinOperator_RSQRT, {1, 1, 4, 1}); m.PopulateTensor(m.input(), {1, 2, 4, 9}); m.Invoke(); EXPECT_THAT(m.ExtractVector(m.output()), @@ -78,6 +92,15 @@ TEST(ElementWise, Rsqrt) { EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); } +TEST(ElementWise, LogicalNot) { + ElementWiseOpBoolModel m(BuiltinOperator_LOGICAL_NOT, {1, 1, 4, 1}); + m.PopulateTensor(m.input(), {true, false, true, false}); + m.Invoke(); + EXPECT_THAT(m.ExtractVector(m.output()), + ElementsAreArray({false, true, false, true})); + EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1})); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc index bc370608c092eeb5312dc40b56f47740f473c8ae..eaf5a67d6787b9113bd0835d436b459e00ed7fff 100644 --- a/tensorflow/contrib/lite/kernels/fully_connected.cc +++ b/tensorflow/contrib/lite/kernels/fully_connected.cc @@ -121,10 +121,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - TF_LITE_ENSURE(context, real_multiplier < 1.0); - QuantizeMultiplierSmallerThanOneExp( - real_multiplier, &data->output_multiplier, &data->output_shift); - data->output_shift *= -1; + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( context, params->activation, output, &data->output_activation_min, &data->output_activation_max)); diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc index ec949056971ccb5f7a6f93fa9f236a93625ca6ad..08b43209466a1b85613ae41d5aa776194f992c60 100644 --- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc +++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc @@ -423,6 +423,37 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTestQuantizedOutputMultiplierGreaterThan1) { + // real_multiplier = 2. + QuantizedFullyConnectedOpModel m( + GetRegistration(), /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -127, 128}, + /*output=*/{TensorType_UINT8, {}, -63.5, 64}); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // first batch + 58, 59, 60, // second batch + }))); + EXPECT_THAT(m.GetOutput(), + ElementsAre(175, 177, 179, 243, 245, 247)); +} + void SimpleTestQuantizedInt16OutputCase( TfLiteRegistration* registration, int input_depth, int output_depth, int batches, FullyConnectedOptionsWeightsFormat weights_format) { @@ -631,6 +662,37 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST_P(QuantizedFullyConnectedOpTest, + SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) { + // real_multiplier = 2. + QuantizedFullyConnectedOpModel m( + GetRegistration(), /*units=*/3, /*batches=*/2, + /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128}, + /*output=*/{TensorType_UINT8, {}, -63.5, 64}); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // first batch + 58, 59, 60, // second batch + }))); + EXPECT_THAT(m.GetOutput(), + ElementsAre(175, 177, 179, 243, 245, 247)); +} + INSTANTIATE_TEST_CASE_P( FloatFullyConnectedOpTest, FloatFullyConnectedOpTest, ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD index 3a855fe3ddaa7e7de0134f8dfee1ccf67168541a..a97db6c6b2523e09705c22ab0463c362ad3d2ff1 100644 --- a/tensorflow/contrib/lite/kernels/internal/BUILD +++ b/tensorflow/contrib/lite/kernels/internal/BUILD @@ -481,6 +481,9 @@ cc_library( ":darwin": [ ":neon_tensor_utils", ], + ":darwin_x86_64": [ + ":neon_tensor_utils", + ], "//conditions:default": [ ":portable_tensor_utils", ], @@ -493,6 +496,7 @@ cc_library( hdrs = ["test_util.h"], deps = [ ":types", + "//tensorflow/contrib/lite:string", ], ) @@ -535,7 +539,10 @@ cc_test( cc_test( name = "depthwiseconv_quantized_test", srcs = ["depthwiseconv_quantized_test.cc"], - tags = ["no_oss"], + tags = [ + "no_oss", + "tflite_not_portable_ios", + ], deps = [ ":optimized_base", ":reference_base", @@ -573,6 +580,7 @@ cc_test( ":quantization_util", ":reference_base", ":test_util", + "//tensorflow/contrib/lite:string", "@com_google_googletest//:gtest_main", ], ) @@ -592,6 +600,7 @@ cc_test( ":quantization_util", ":reference_base", ":test_util", + "//tensorflow/contrib/lite:string", "@com_google_googletest//:gtest_main", ], ) @@ -603,6 +612,7 @@ cc_test( deps = [ ":optimized_base", ":reference_base", + "//tensorflow/contrib/lite:string", "@com_google_googletest//:gtest_main", ], ) diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h index 310a8980e6943db3804b0671a21ccf0e6ce34c28..eb4d0108bd0438dd27744a864d071cfc166a7a94 100644 --- a/tensorflow/contrib/lite/kernels/internal/common.h +++ b/tensorflow/contrib/lite/kernels/internal/common.h @@ -117,6 +117,9 @@ template int CountLeadingZeros(T integer_input) { static_assert(std::is_unsigned::value, "Only unsigned integer types handled."); +#if defined(__GNUC__) + return integer_input ? __builtin_clz(integer_input) : 0; +#else const T one_in_leading_positive = static_cast(1) << (std::numeric_limits::digits - 1); int leading_zeros = 0; @@ -125,6 +128,7 @@ int CountLeadingZeros(T integer_input) { ++leading_zeros; } return leading_zeros; +#endif } // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING diff --git a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc index 7e9ff5242a43a8b54e0e6ae167cdcf7a341c918e..8963abb9afd9d51473fe5a22d8e88d314b385ad9 100644 --- a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc @@ -29,8 +29,9 @@ limitations under the License. #include #include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" +#include "tensorflow/contrib/lite/string.h" -namespace { +namespace tflite { class NumberGenerator { public: @@ -330,4 +331,4 @@ TEST_F(LogQuantizedTest, SelectedIntegerBits) { &generator_); } -} // namespace +} // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc index d2f1103e14b40b81c59c8053bcdbee30c85e5c78..3624c20ae3bbf5f8eb5cb5fb51aadcde7327fd55 100644 --- a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/string.h" namespace tflite { namespace { diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h index d85e06a5d5af8d23235a08592d49754e4f493d34..250872c422a3ff9b3353d0055513ff1f7f03d68e 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h @@ -33,7 +33,7 @@ limitations under the License. #include #ifdef _WIN32 -#include +#include #elif defined(__APPLE__) #include #else diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 78567d52eaab779c724d3e3d04fbaf92fe6e589b..b87078977234fd856cb0fcd96363ba92ddb3ad74 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -168,6 +168,18 @@ ArrayMap MapAsArrayWithFirstDimAsRows(Scalar* data, return ArrayMap(data, rows, cols); } +// Copied from tensorflow/core/framework/tensor_types.h +template +struct TTypes { + // Rank-1 tensor (vector) of scalar type T. + typedef Eigen::TensorMap, + Eigen::Aligned> + Flat; + typedef Eigen::TensorMap< + Eigen::Tensor> + UnalignedConstMatrix; +}; + // TODO(b/62193649): this function is only needed as long // as we have the --variable_batch hack. template @@ -881,6 +893,7 @@ inline void FullyConnectedAsGEMV( const int input_size = FlatSizeSkipDim(input_dims, 3); const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); static constexpr int kPeel = 4; + const bool shift_left = (output_shift <= 0); for (int k = 0; k < input_size; k += 64) { optimized_ops_preload_l1_stream(input_data + k); } @@ -992,11 +1005,17 @@ inline void FullyConnectedAsGEMV( int32x4_t bias_vec = vld1q_s32(bias_ptr); bias_ptr += 4; reduced = vaddq_s32(reduced, bias_vec); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, output_shift); + if (shift_left) { + const int32 multiplier_power_of_two = 1 << -output_shift; + reduced = vmulq_n_s32(reduced, multiplier_power_of_two); + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + } else { + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, output_shift); + } // Add the output offset. const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); reduced = vaddq_s32(reduced, output_offset_vec); @@ -1018,10 +1037,10 @@ inline void FullyConnectedAsGEMV( struct GemmlowpOutputPipeline { typedef gemmlowp::VectorMap ColVectorMap; - typedef std::tuple< - gemmlowp::OutputStageBiasAddition, - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, - gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> + typedef std::tuple, + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, + gemmlowp::OutputStageClamp, + gemmlowp::OutputStageSaturatingCastToUint8> Pipeline; static Pipeline MakeExp(const int32* bias_data, int output_rows, int32 output_offset, int32 output_multiplier, @@ -1030,11 +1049,10 @@ struct GemmlowpOutputPipeline { ColVectorMap bias_vector(bias_data, output_rows); gemmlowp::OutputStageBiasAddition bias_addition_stage; bias_addition_stage.bias_vector = bias_vector; - gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint - quantize_down_stage; + gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage; quantize_down_stage.result_offset_after_shift = output_offset; quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; - quantize_down_stage.result_shift = -output_left_shift; + quantize_down_stage.result_exponent = output_left_shift; gemmlowp::OutputStageClamp clamp_stage; clamp_stage.min = output_activation_min; clamp_stage.max = output_activation_max; @@ -2315,7 +2333,8 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, ++*output_shift; } TFLITE_DCHECK_GT(input, 0); - const unsigned max_left_shift_bits = __builtin_clz(input) - 1; + const unsigned max_left_shift_bits = + CountLeadingZeros(static_cast(input)) - 1; const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; *output_shift -= left_shift_bit_pairs; @@ -4023,7 +4042,7 @@ inline void Softmax(const uint8* input_data, const RuntimeShape& input_shape, // perform a division by the above-computed sum-of-exponentials. int32 fixed_sum_of_exps = sum_of_exps.raw(); int headroom_plus_one = - __builtin_clz(static_cast(fixed_sum_of_exps)); + CountLeadingZeros(static_cast(fixed_sum_of_exps)); // This is the number of bits to the left of the binary point above 1.0. // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and // no later adjustment will be needed. @@ -4169,7 +4188,7 @@ log_x_for_x_greater_than_or_equal_to_1_impl( // required shift "ourselves" instead of using, say, Rescale. FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw()); // z_a_pow_2 = input_integer_bits - z_a_headroom; - int z_a_headroom_plus_1 = __builtin_clz(static_cast(z_a.raw())); + int z_a_headroom_plus_1 = CountLeadingZeros(static_cast(z_a.raw())); FixedPoint0 r_a_tmp = SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1)); const int32 r_a_raw = @@ -4184,7 +4203,7 @@ log_x_for_x_greater_than_or_equal_to_1_impl( // z_b is treated like z_a, but premultiplying by sqrt(0.5). FixedPoint0 z_b = z_a * sqrt_half; - int z_b_headroom = __builtin_clz(static_cast(z_b.raw())) - 1; + int z_b_headroom = CountLeadingZeros(static_cast(z_b.raw())) - 1; const int32 r_b_raw = SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom); const FixedPointAccum z_b_pow_2_adj = SaturatingSub( diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc index e224980493aa11f642da103ee7d7377b6c4b1da0..f882f9910e0c65d69eb5a86886bae4d3c881e6ab 100644 --- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc +++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc @@ -109,12 +109,12 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) { void NudgeQuantizationRange(const float min, const float max, const int quant_min, const int quant_max, float* nudged_min, float* nudged_max, - float* scale) { + float* nudged_scale) { // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h. const float quant_min_float = static_cast(quant_min); const float quant_max_float = static_cast(quant_max); - *scale = (max - min) / (quant_max_float - quant_min_float); - const float zero_point_from_min = quant_min_float - min / *scale; + *nudged_scale = (max - min) / (quant_max_float - quant_min_float); + const float zero_point_from_min = quant_min_float - min / *nudged_scale; uint16 nudged_zero_point; if (zero_point_from_min < quant_min_float) { nudged_zero_point = static_cast(quant_min); @@ -123,8 +123,25 @@ void NudgeQuantizationRange(const float min, const float max, } else { nudged_zero_point = static_cast(TfLiteRound(zero_point_from_min)); } - *nudged_min = (quant_min_float - nudged_zero_point) * (*scale); - *nudged_max = (quant_max_float - nudged_zero_point) * (*scale); + *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale); + *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale); +} + +void FakeQuantizeArray(const float nudged_scale, const float nudged_min, + const float nudged_max, const float* input_data, + float* output_data, const float size) { + // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h. + const float inv_nudged_scale = 1.0f / nudged_scale; + + for (int i = 0; i < size; i++) { + const float src_val = input_data[i]; + const float clamped = std::min(nudged_max, std::max(nudged_min, src_val)); + const float clamped_shifted = clamped - nudged_min; + const float dst_val = + TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale + + nudged_min; + output_data[i] = dst_val; + } } bool CheckedLog2(const float x, int* log2_result) { diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h index 9b3f1823dc7e08562d8906346bc44e4478642ddc..9ee4a47fbb5bba1a409830f99c7b9ba967325a0a 100644 --- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h +++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h @@ -222,7 +222,15 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift); // Outputs nudged_min, nudged_max, nudged_scale. void NudgeQuantizationRange(const float min, const float max, const int quant_min, const int quant_max, - float* nudged_min, float* nudged_max, float* scale); + float* nudged_min, float* nudged_max, + float* nudged_scale); + +// Fake quantizes (quantizes and dequantizes) input_data using the scale, +// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code +// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor. +void FakeQuantizeArray(const float nudged_scale, const float nudged_min, + const float nudged_max, const float* input_data, + float* output_data, const float size); // If x is approximately a power of two (with any positive or negative // exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc index 94773b47d3817d7ed7240f74545ad04e7fa4bd52..00fc3e91dc90254ca68d637941e5a2482e2832a8 100644 --- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc @@ -130,22 +130,22 @@ void RunSafeCastTests() { } TEST(QuantizationUtilTest, SafeCast) { - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); - RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); + RunSafeCastTests(); } // Example taken from http://www.tensorflow.org/performance/quantization diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc index 6bd88b5596bc0f7c425745012b7b4a091b64afbb..aa93e857d7a9f98aa06e91ff3d6c743b00b17137 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -21,6 +21,10 @@ limitations under the License. #include "tensorflow/contrib/lite/kernels/internal/round.h" #include "tensorflow/contrib/lite/kernels/op_macros.h" +#if defined(_MSC_VER) +#define __restrict__ __restrict +#endif + namespace tflite { namespace tensor_utils { @@ -38,10 +42,8 @@ bool PortableIsZeroVector(const float* vector, int v_size) { } void PortableSymmetricQuantizeFloats(const float* values, const int size, - int8_t* quantized_values, - float* __restrict__ min_value, - float* __restrict__ max_value, - float* __restrict__ scaling_factor) { + int8_t* quantized_values, float* min_value, + float* max_value, float* scaling_factor) { auto minmax = std::minmax_element(values, values + size); *min_value = *minmax.first; *max_value = *minmax.second; @@ -71,10 +73,12 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, for (int b = 0; b < n_batch; b++) { const float* matrix_ptr = matrix; for (int r = 0; r < m_rows; r++) { + float dot_prod = 0.0f; const float* vector_in_batch = vector + b * m_cols; for (int c = 0; c < m_cols; c++) { - *result_in_batch += *matrix_ptr++ * *vector_in_batch++; + dot_prod += *matrix_ptr++ * *vector_in_batch++; } + *result_in_batch += dot_prod; result_in_batch += result_stride; } } @@ -82,9 +86,8 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, - const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, int result_stride) { + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride) { int batch, row, col; for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) { const float batch_scaling_factor = scaling_factors[batch]; @@ -93,9 +96,11 @@ void PortableMatrixBatchVectorMultiplyAccumulate( for (row = 0; row < m_rows; ++row, result += result_stride) { // Initialize the dot product sum for the row to 0. int32_t dotprod = 0; +#if defined(__GNUC__) // Prefetch the row to cache. __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */); +#endif // For every block of 16 8-bit elements (128-bit register) from each row. for (col = 0; col < m_cols; ++col, ++row_ptr) { dotprod += (*row_ptr) * (vectors[col]); diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 714613b96e11d417cb962eb76baee794556d12eb..f4176e474e738d83783379fff0e45722396f24a6 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -322,8 +322,8 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims, if (bias_data) { acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; } - acc = MultiplyByQuantizedMultiplierSmallerThanOneExp( - acc, output_multiplier, kReverseShift * output_shift); + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + kReverseShift * output_shift); acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); @@ -546,8 +546,8 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, if (bias_data) { acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)]; } - acc = MultiplyByQuantizedMultiplierSmallerThanOneExp( - acc, output_multiplier, kReverseShift * output_shift); + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + kReverseShift * output_shift); acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); @@ -903,7 +903,8 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32 input, ++*output_shift; } TFLITE_DCHECK_GT(input, 0); - const unsigned max_left_shift_bits = __builtin_clz(input) - 1; + const unsigned max_left_shift_bits = + CountLeadingZeros(static_cast(input)) - 1; const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; *output_shift -= left_shift_bit_pairs; @@ -3155,18 +3156,9 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, float nudged_min, nudged_max, nudged_scale; NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, &nudged_max, &nudged_scale); - const float inv_nudged_scale = 1.0f / nudged_scale; - const int flat_size = MatchingFlatSize(output_dims, input_dims); - for (int i = 0; i < flat_size; i++) { - const float src_val = input_data[i]; - const float clamped = std::min(nudged_max, std::max(nudged_min, src_val)); - const float clamped_shifted = clamped - nudged_min; - const float dst_val = - TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale + - nudged_min; - output_data[i] = dst_val; - } + FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, + output_data, flat_size); } template @@ -4190,8 +4182,8 @@ inline void RankOneSelect(const D* input_condition_data, } // For easy implementation, the indices is always a vector of size-4 vectors. -template -inline void SparseToDense(const std::vector>& indices, +template +inline void SparseToDense(const std::vector>& indices, const T* values, T default_value, T* output_data, const Dims<4>& output_dims, bool value_is_scalar) { const int value_count = indices.size(); @@ -4206,7 +4198,7 @@ inline void SparseToDense(const std::vector>& indices, // condition within the loop every time. if (value_is_scalar) { for (int i = 0; i < value_count; ++i) { - const std::vector& index = indices[i]; + const std::vector& index = indices[i]; TFLITE_DCHECK_EQ(index.size(), 4); const T value = *values; // just use the first value. output_data[Offset(output_dims, index[3], index[2], index[1], index[0])] = @@ -4217,7 +4209,7 @@ inline void SparseToDense(const std::vector>& indices, // Go through the values and indices to fill the sparse values. for (int i = 0; i < value_count; ++i) { - const std::vector& index = indices[i]; + const std::vector& index = indices[i]; TFLITE_DCHECK_EQ(index.size(), 4); const T value = values[i]; output_data[Offset(output_dims, index[3], index[2], index[1], index[0])] = @@ -4287,6 +4279,33 @@ inline void BroadcastLogical(const bool* input1_data, } } +// TODO(ycling): Refactoring. Remove BroadcastLogical and use the more +// generalized and efficient BroadcastBinaryFunction. +// +// R: Result type. T1: Input 1 type. T2: Input 2 type. +template +inline void BroadcastBinaryFunction(const T1* input1_data, + const Dims<4>& input1_dims, + const T2* input2_data, + const Dims<4>& input2_dims, R* output_data, + const Dims<4>& output_dims, + R (*func)(T1, T2)) { + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + output_data[Offset(output_dims, c, x, y, b)] = + func(input1_data[SubscriptToIndex(desc1, c, x, y, b)], + input2_data[SubscriptToIndex(desc2, c, x, y, b)]); + } + } + } + } +} + } // namespace reference_ops } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc index a7dad3c14e60fac9da9c0bcfd5d1d4c8f10b71c7..ca94e7740eb18e9d2d36c676e1db2766d7050852 100644 --- a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/contrib/lite/kernels/internal/quantization_util.h" #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/contrib/lite/kernels/internal/test_util.h" +#include "tensorflow/contrib/lite/string.h" namespace tflite { namespace { diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc index 372a6efec5c69e53d558edf8c822f638a4d33d81..e8343f1223b2137a7df9cc264c56100bc66f9fc1 100644 --- a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc @@ -72,7 +72,7 @@ TEST(uKernels, SymmetricQuantizeFloatsTest) { static float input[kVectorSize] = {-640, -635.0, -630, 10.0, 2.0, -5.0, -10.0, 0.0, 1000.0}; - int8 output[kVectorSize]; + int8_t output[kVectorSize]; float min, max, scaling_factor; SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max, &scaling_factor); @@ -89,7 +89,7 @@ TEST(uKernels, SymmetricQuantizeFloatsAllZerosTest) { constexpr int kVectorSize = 9; static float input[kVectorSize] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - int8 output[kVectorSize]; + int8_t output[kVectorSize]; float min, max, scaling_factor; SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max, &scaling_factor); @@ -105,7 +105,7 @@ TEST(uKernels, SymmetricQuantizeFloatsAllAlmostZeroTest) { static float input[kVectorSize] = {-1e-5, 3e-5, -7e-6, -9e-5, 1e-6, 4e-5, 9e-6, 2e-4, 0}; - int8 output[kVectorSize]; + int8_t output[kVectorSize]; float min, max, scaling_factor; SymmetricQuantizeFloats(input, kVectorSize, output, &min, &max, &scaling_factor); @@ -143,6 +143,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) { -1., 3., 7., 3., 23., 3.}))); } +#ifdef __ANDROID__ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { // Note we use 29 columns as this exercises all the neon kernel: the // 16-block SIMD code, the 8-block postamble, and the leftover postamble. @@ -166,13 +167,13 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { -13.13, 14.14, -15.15, 16.16, -17.17, 18.18, -19.19, 20.2, -21.21, 22.22, -23.23, 24.24, -25.25, 26.26, -27.27, 28.28, 0}; - int8* a_int8_data = reinterpret_cast( + int8_t* a_int8_data = reinterpret_cast( aligned_malloc(a_rows * a_cols, kWeightsPerUint32)); float a_min, a_max; float scaling_factor_a; SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min, &a_max, &scaling_factor_a); - const int8 expected_a_int8_data[] = { + const int8_t expected_a_int8_data[] = { /* 1st row */ 5, 10, @@ -363,7 +364,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { }; // Quantized values of B: - int8 b_int8_data[b_rows * b_cols * batches]; + int8_t b_int8_data[b_rows * b_cols * batches]; float b_min, b_max; float scaling_factor_b[batches]; SymmetricQuantizeFloats(b_float_data, b_rows * b_cols, b_int8_data, &b_min, @@ -372,7 +373,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { &b_int8_data[b_rows * b_cols], &b_min, &b_max, &scaling_factor_b[1]); - const int8 expected_b_int8_data[] = { + const int8_t expected_b_int8_data[] = { /* batch 1 */ 127, -127, @@ -465,6 +466,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) { aligned_free(a_int8_data); } +#endif // __ANDROID__ TEST(uKernels, VectorVectorCwiseProductTest) { constexpr int kVectorSize = 10; diff --git a/tensorflow/contrib/lite/kernels/logical.cc b/tensorflow/contrib/lite/kernels/logical.cc index 3dc39bf79a1c054c4d1c82b51a74a21051b58838..87c2fee667ccaf7bfdc4e2316309d2abc35b5324 100644 --- a/tensorflow/contrib/lite/kernels/logical.cc +++ b/tensorflow/contrib/lite/kernels/logical.cc @@ -105,6 +105,11 @@ TfLiteStatus LogicalOrEval(TfLiteContext* context, TfLiteNode* node) { return LogicalImpl(context, node, logical_or_func); } +TfLiteStatus LogicalAndEval(TfLiteContext* context, TfLiteNode* node) { + const auto logical_and_func = std::logical_and(); + return LogicalImpl(context, node, logical_and_func); +} + } // namespace } // namespace logical @@ -116,6 +121,14 @@ TfLiteRegistration* Register_LOGICAL_OR() { return &r; } +TfLiteRegistration* Register_LOGICAL_AND() { + // Init, Free, Prepare, Eval are satisfying the Interface required by + // TfLiteRegistration. + static TfLiteRegistration r = {logical::Init, logical::Free, logical::Prepare, + logical::LogicalAndEval}; + return &r; +} + } // namespace builtin } // namespace ops } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/logical_test.cc b/tensorflow/contrib/lite/kernels/logical_test.cc index 382008245bf0b0e39218e16228b67ae389ac6add..206cbde98fa48ec5f7c863bbced9dccc9cab5207 100644 --- a/tensorflow/contrib/lite/kernels/logical_test.cc +++ b/tensorflow/contrib/lite/kernels/logical_test.cc @@ -52,6 +52,11 @@ class LogicalOpModel : public SingleOpModel { CreateLogicalOrOptions(builder_).Union()); break; } + case BuiltinOperator_LOGICAL_AND: { + SetBuiltinOp(op, BuiltinOptions_LogicalAndOptions, + CreateLogicalAndOptions(builder_).Union()); + break; + } default: { FAIL() << "We shouldn't get here."; } } } @@ -77,6 +82,26 @@ TEST(LogicalTest, BroadcastLogicalOr) { EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); } +TEST(LogicalTest, LogicalAnd) { + LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, BuiltinOperator_LOGICAL_AND); + model.PopulateTensor(model.input1(), {true, false, false, true}); + model.PopulateTensor(model.input2(), {true, false, true, false}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, false)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + +TEST(LogicalTest, BroadcastLogicalAnd) { + LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, BuiltinOperator_LOGICAL_AND); + model.PopulateTensor(model.input1(), {true, false, false, true}); + model.PopulateTensor(model.input2(), {true}); + model.Invoke(); + + EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true)); + EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4)); +} + } // namespace } // namespace tflite diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc index e63272884141006f2a5613aa536c1bf4d4c4c53c..8d2c108116e1666f342392ada44854190a5b80ee 100644 --- a/tensorflow/contrib/lite/kernels/register.cc +++ b/tensorflow/contrib/lite/kernels/register.cc @@ -109,6 +109,35 @@ TfLiteRegistration* Register_FAKE_QUANT(); TfLiteRegistration* Register_PACK(); TfLiteRegistration* Register_ONE_HOT(); TfLiteRegistration* Register_LOGICAL_OR(); +TfLiteRegistration* Register_LOGICAL_AND(); +TfLiteRegistration* Register_LOGICAL_NOT(); + +TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) { + context->ReportError( + context, + "Regular TensorFlow ops are not supported by this interpreter. Make sure " + "you invoke the Eager delegate before inference."); + return kTfLiteError; +} + +const TfLiteRegistration* BuiltinOpResolver::FindOp(tflite::BuiltinOperator op, + int version) const { + return MutableOpResolver::FindOp(op, version); +} + +const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op, + int version) const { + // Return the NULL Op for all ops whose name start with "Eager:", allowing + // the interpreter to delegate their execution. + if (string(op).find("Eager:") == 0) { + static TfLiteRegistration null_op{ + nullptr, nullptr, &UnsupportedTensorFlowOp, + nullptr, nullptr, BuiltinOperator_CUSTOM, + "Eager", 1}; + return &null_op; + } + return MutableOpResolver::FindOp(op, version); +} BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_RELU, Register_RELU()); @@ -201,6 +230,8 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_PACK, Register_PACK()); AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT()); AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR()); + AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND()); + AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h index 940718d67e70b7206227b891ea529cb9e9619161..0296152d68d6836fd592a65eeea69a7d4ebbb6ef 100644 --- a/tensorflow/contrib/lite/kernels/register.h +++ b/tensorflow/contrib/lite/kernels/register.h @@ -26,6 +26,10 @@ namespace builtin { class BuiltinOpResolver : public MutableOpResolver { public: BuiltinOpResolver(); + + const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, + int version) const override; + const TfLiteRegistration* FindOp(const char* op, int version) const override; }; } // namespace builtin diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc index 7be5e66c166cd752fc325f25d38e6522948e0f06..fec2a6f0d97ae48e0c49d82c726278a46d96a7fc 100644 --- a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc +++ b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc @@ -187,7 +187,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return ResizeOutputShape(context, output_shape, output); } -template +template TfLiteStatus SparseToDenseImpl(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor); const TfLiteTensor* output_shape = @@ -204,10 +204,10 @@ TfLiteStatus SparseToDenseImpl(TfLiteContext* context, TfLiteNode* node) { const int num_indices = SizeOfDimension(indices, 0); const bool value_is_scalar = NumDimensions(values) == 0; - std::vector> indices_vector; + std::vector> indices_vector; indices_vector.reserve(num_indices); - TF_LITE_ENSURE_OK(context, GetIndicesVector(context, indices, num_indices, - &indices_vector)); + TF_LITE_ENSURE_OK(context, GetIndicesVector(context, indices, num_indices, + &indices_vector)); reference_ops::SparseToDense(indices_vector, GetTensorData(values), *GetTensorData(default_value), GetTensorData(output), GetTensorDims(output), diff --git a/tensorflow/contrib/lite/kernels/tile.cc b/tensorflow/contrib/lite/kernels/tile.cc index af77f074742eb3fef10a74616ff679255911fbb2..5181a8f89a376302bad02913e3c7c1d094821da8 100644 --- a/tensorflow/contrib/lite/kernels/tile.cc +++ b/tensorflow/contrib/lite/kernels/tile.cc @@ -87,8 +87,9 @@ std::pair TileOneDimension(const TfLiteIntArray& in_dimensions, if (dimension == in_dimensions.size - 1) { CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data); - return std::make_pair(dimension_size, - dimension_size * multipliers[dimension]); + return std::make_pair( + dimension_size, + dimension_size * static_cast(multipliers[dimension])); } int total_stride_size = 0, total_tiled_stride_size = 0; const T* copy_from_data = in_data; diff --git a/tensorflow/contrib/lite/mmap_allocation.cc b/tensorflow/contrib/lite/mmap_allocation.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa9a3cd1d839b07149bb80c3b7714b32b5eda235 --- /dev/null +++ b/tensorflow/contrib/lite/mmap_allocation.cc @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/allocation.h" +#include "tensorflow/contrib/lite/error_reporter.h" + +namespace tflite { + +MMAPAllocation::MMAPAllocation(const char* filename, + ErrorReporter* error_reporter) + : Allocation(error_reporter), mmapped_buffer_(MAP_FAILED) { + mmap_fd_ = open(filename, O_RDONLY); + if (mmap_fd_ == -1) { + error_reporter_->Report("Could not open '%s'.", filename); + return; + } + struct stat sb; + fstat(mmap_fd_, &sb); + buffer_size_bytes_ = sb.st_size; + mmapped_buffer_ = + mmap(nullptr, buffer_size_bytes_, PROT_READ, MAP_SHARED, mmap_fd_, 0); + if (mmapped_buffer_ == MAP_FAILED) { + error_reporter_->Report("Mmap of '%s' failed.", filename); + return; + } +} + +MMAPAllocation::~MMAPAllocation() { + if (valid()) { + munmap(const_cast(mmapped_buffer_), buffer_size_bytes_); + } + if (mmap_fd_ != -1) close(mmap_fd_); +} + +const void* MMAPAllocation::base() const { return mmapped_buffer_; } + +size_t MMAPAllocation::bytes() const { return buffer_size_bytes_; } + +bool MMAPAllocation::valid() const { return mmapped_buffer_ != MAP_FAILED; } + +bool MMAPAllocation::IsSupported() { return true; } + +} // namespace tflite diff --git a/tensorflow/contrib/lite/mmap_allocation_disabled.cc b/tensorflow/contrib/lite/mmap_allocation_disabled.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3d4cf1a257d43ebd56cc9b8831de0bb1994d40c --- /dev/null +++ b/tensorflow/contrib/lite/mmap_allocation_disabled.cc @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/lite/allocation.h" + +#include + +namespace tflite { + +MMAPAllocation::MMAPAllocation(const char* filename, + ErrorReporter* error_reporter) + : Allocation(error_reporter), mmapped_buffer_(nullptr) { + // The disabled variant should never be created. + assert(false); +} + +MMAPAllocation::~MMAPAllocation() {} + +const void* MMAPAllocation::base() const { return nullptr; } + +size_t MMAPAllocation::bytes() const { return 0; } + +bool MMAPAllocation::valid() const { return false; } + +bool MMAPAllocation::IsSupported() { return false; } + +} // namespace tflite diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index 5814cddc5ba8d4099a449ea6e42fc031f6ef46b9..9edf5ba38f4c6506524074bc0a3ebe7e068c7ee3 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include #include -#include #include #include @@ -24,7 +23,9 @@ limitations under the License. #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/model.h" +#ifndef TFLITE_MCU #include "tensorflow/contrib/lite/nnapi_delegate.h" +#endif #include "tensorflow/contrib/lite/version.h" namespace tflite { @@ -73,6 +74,7 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type, return kTfLiteOk; } +#ifndef TFLITE_MCU // Loads a model from `filename`. If `mmap_file` is true then use mmap, // otherwise make a copy of the model in a buffer. std::unique_ptr GetAllocationFromFile(const char* filename, @@ -80,8 +82,8 @@ std::unique_ptr GetAllocationFromFile(const char* filename, ErrorReporter* error_reporter, bool use_nnapi) { std::unique_ptr allocation; - if (mmap_file) { - if (use_nnapi && NNAPIExists()) + if (mmap_file && MMAPAllocation::IsSupported()) { + if (use_nnapi && NNAPIDelegate::IsSupported()) allocation.reset(new NNAPIAllocation(filename, error_reporter)); else allocation.reset(new MMAPAllocation(filename, error_reporter)); @@ -120,6 +122,7 @@ std::unique_ptr FlatBufferModel::VerifyAndBuildFromFile( if (!model->initialized()) model.reset(); return model; } +#endif std::unique_ptr FlatBufferModel::BuildFromBuffer( const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) { @@ -781,6 +784,8 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_TRANSPOSE: case BuiltinOperator_POW: case BuiltinOperator_LOGICAL_OR: + case BuiltinOperator_LOGICAL_AND: + case BuiltinOperator_LOGICAL_NOT: break; } return kTfLiteOk; diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.h b/tensorflow/contrib/lite/models/smartreply/predictor.h index 90260c8d620b0e756f72089d3f4d8d9f92d44fbe..3151192d9277b6df513a76afb08af30d0379b7b1 100644 --- a/tensorflow/contrib/lite/models/smartreply/predictor.h +++ b/tensorflow/contrib/lite/models/smartreply/predictor.h @@ -65,9 +65,9 @@ struct SmartReplyConfig { float backoff_confidence; // Backoff responses are used when predicted responses cannot fulfill the // list. - const std::vector& backoff_responses; + std::vector backoff_responses; - SmartReplyConfig(std::vector backoff_responses) + SmartReplyConfig(const std::vector& backoff_responses) : num_response(kDefaultNumResponse), backoff_confidence(kDefaultBackoffConfidence), backoff_responses(backoff_responses) {} diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h index becd1f615f04a806cba9c494323285c004ec41df..42b8163445d252c766491e7bcd2fd7eea0dd7571 100644 --- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h +++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h @@ -44,6 +44,19 @@ inline void* loadLibrary(const char* name) { return handle; } +typedef int (*ASharedMemory_create_fn)(const char* name, size_t size); + +// ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI +// which was added in 8.1. +inline int ASharedMemory_create(const char* name, size_t size) { + static void* handle = loadLibrary("libandroid.so"); + static ASharedMemory_create_fn fn = + handle != nullptr ? reinterpret_cast( + dlsym(handle, "ASharedMemory_create")) + : nullptr; + return fn(name, size); +} + inline void* getLibraryHandle() { static void* handle = loadLibrary("libneuralnetworks.so"); return handle; diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc index 1c06b29deb541fa73dd597c7f8e465c760f1720b..45c92a86716ae22f2c44fed5f94cf81336fdddaa 100644 --- a/tensorflow/contrib/lite/nnapi_delegate.cc +++ b/tensorflow/contrib/lite/nnapi_delegate.cc @@ -24,20 +24,27 @@ limitations under the License. #include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h" #ifdef __ANDROID__ +#include #include #endif namespace tflite { void logError(const char* format, ...) { - // TODO(mikie): use android logging, stderr is not captured for Java - // applications - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); + // stderr is convenient for native tests, but is not captured for apps + va_list args_for_stderr; + va_start(args_for_stderr, format); + vfprintf(stderr, format, args_for_stderr); + va_end(args_for_stderr); fprintf(stderr, "\n"); fflush(stderr); +#ifdef __ANDROID__ + // produce logcat output for general consumption + va_list args_for_log; + va_start(args_for_log, format); + __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log); + va_end(args_for_log); +#endif } #define FATAL(...) \ @@ -564,13 +571,27 @@ TfLiteStatus AddOpsAndParams( nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION; if (reinterpret_cast(node.builtin_data) ->activation != kTfLiteActNone) { - FATAL( + logError( "NNAPI does not support L2Normalization with fused activations"); + return kTfLiteError; + } + if ((node.inputs->size > 0) && + (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) { + logError("NNAPI only supports input rank 4 for L2Normalization"); + return kTfLiteError; } break; + case tflite::BuiltinOperator_HASHTABLE_LOOKUP: + if (interpreter->tensor(node.outputs->data[0])->type != + kTfLiteFloat32) { + logError("NNAPI only support HASHTABLE_LOOKUP with float32 output", + builtin); + return kTfLiteError; + } + nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP; + break; case tflite::BuiltinOperator_CONCAT_EMBEDDINGS: case tflite::BuiltinOperator_LSH_PROJECTION: - case tflite::BuiltinOperator_HASHTABLE_LOOKUP: case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: @@ -624,6 +645,8 @@ TfLiteStatus AddOpsAndParams( case tflite::BuiltinOperator_PACK: case tflite::BuiltinOperator_LOGICAL_OR: case tflite::BuiltinOperator_ONE_HOT: + case tflite::BuiltinOperator_LOGICAL_AND: + case tflite::BuiltinOperator_LOGICAL_NOT: logError("Op code %d is currently not delegated to NNAPI", builtin); return kTfLiteError; break; @@ -789,4 +812,6 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) { return kTfLiteOk; } +bool NNAPIDelegate::IsSupported() { return NNAPIExists(); } + } // namespace tflite diff --git a/tensorflow/contrib/lite/nnapi_delegate.h b/tensorflow/contrib/lite/nnapi_delegate.h index 8dc7d38a303f51b7ccefefd8c9d2990b443e6827..2bdb2cc5c8211a48ea07e7ec45f9eebc0a3f7c10 100644 --- a/tensorflow/contrib/lite/nnapi_delegate.h +++ b/tensorflow/contrib/lite/nnapi_delegate.h @@ -19,9 +19,10 @@ limitations under the License. #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/error_reporter.h" #include "tensorflow/contrib/lite/interpreter.h" -#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h" -class ANeuralNetworsModel; +class ANeuralNetworksModel; +class ANeuralNetworksMemory; +class ANeuralNetworksCompilation; namespace tflite { @@ -54,6 +55,9 @@ class NNAPIDelegate { // Run TfLiteStatus Invoke(Interpreter* interpreter); + // Whether the current platform supports NNAPI delegation. + static bool IsSupported(); + private: // The NN API model handle ANeuralNetworksModel* nn_model_ = nullptr; diff --git a/tensorflow/contrib/lite/nnapi_delegate_disabled.cc b/tensorflow/contrib/lite/nnapi_delegate_disabled.cc new file mode 100644 index 0000000000000000000000000000000000000000..efde72b1a76a86728f4cccd8782ca0e993dd0338 --- /dev/null +++ b/tensorflow/contrib/lite/nnapi_delegate_disabled.cc @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/nnapi_delegate.h" + +#include + +namespace tflite { + +NNAPIAllocation::NNAPIAllocation(const char* filename, + ErrorReporter* error_reporter) + : MMAPAllocation(filename, error_reporter) { + // The disabled variant should never be created. + assert(false); +} + +NNAPIAllocation::~NNAPIAllocation() {} + +NNAPIDelegate::~NNAPIDelegate() {} + +TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) { + return kTfLiteError; +} + +TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) { + return kTfLiteError; +} + +bool NNAPIDelegate::IsSupported() { return false; } + +} // namespace tflite diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py index ec49738fb5365a16c41cc6737198b5707508a3e2..11d4bdbe82295bff9a7a457e2fd5ca1f8fe04036 100644 --- a/tensorflow/contrib/lite/python/convert.py +++ b/tensorflow/contrib/lite/python/convert.py @@ -54,7 +54,7 @@ def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str): """Convert `input_data_str` according to model and toco parameters. Unless you know what you are doing consider using - the more friendly @{tf.contrib.lite.toco_convert}}. + the more friendly `tf.contrib.lite.toco_convert`. Args: model_flags_str: Serialized proto describing model properties, see diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py index 3243bddac879b8eb0ca7a03d28b2f6094f905983..1be61fe05343a0e7d39f2808c78672698e0d767f 100644 --- a/tensorflow/contrib/lite/python/interpreter.py +++ b/tensorflow/contrib/lite/python/interpreter.py @@ -54,6 +54,10 @@ class Interpreter(object): if not self._interpreter: raise ValueError('Failed to open {}'.format(model_path)) elif model_content and not model_path: + # Take a reference, so the pointer remains valid. + # Since python strings are immutable then PyString_XX functions + # will always return the same pointer. + self._model_content = model_content self._interpreter = ( _interpreter_wrapper.InterpreterWrapper_CreateWrapperCPPFromBuffer( model_content)) diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h index 3e03751da40064c64ab646d0b976a2ff5ca9c250..641dd93db5b9df292e03e9704a218299f48b14fb 100644 --- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h +++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h @@ -15,12 +15,15 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_ #define TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_ -// Place `` before to avoid build failures in macOS. -#include #include #include #include +// Place `` before to avoid build failures in macOS. +#include + +// The empty line above is on purpose as otherwise clang-format will +// automatically move before . #include // We forward declare TFLite classes here to avoid exposing them to SWIG. diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index 2f9b9d469a27cc8910cb61c0da14769e5ff0baf0..5ec52035add63ffe5a47fffae258ce4a2efd1bcc 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -53,8 +53,8 @@ from tensorflow.core.framework import graph_pb2 as _graph_pb2 from tensorflow.python import keras as _keras from tensorflow.python.client import session as _session from tensorflow.python.framework import graph_util as _tf_graph_util +from tensorflow.python.framework import ops as _ops from tensorflow.python.framework.importer import import_graph_def as _import_graph_def -from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer from tensorflow.python.saved_model import signature_constants as _signature_constants from tensorflow.python.saved_model import tag_constants as _tag_constants @@ -194,42 +194,41 @@ class TocoConverter(object): The graph is not frozen. input_arrays or output_arrays contains an invalid tensor name. """ - with _session.Session() as sess: - sess.run(_global_variables_initializer()) - - # Read GraphDef from file. - graph_def = _graph_pb2.GraphDef() - with open(graph_def_file, "rb") as f: - file_content = f.read() - try: - graph_def.ParseFromString(file_content) - except (_text_format.ParseError, DecodeError): + with _ops.Graph().as_default(): + with _session.Session() as sess: + # Read GraphDef from file. + graph_def = _graph_pb2.GraphDef() + with open(graph_def_file, "rb") as f: + file_content = f.read() try: - print("Ignore 'tcmalloc: large alloc' warnings.") - - if not isinstance(file_content, str): - if PY3: - file_content = file_content.decode('utf-8') - else: - file_content = file_content.encode('utf-8') - _text_format.Merge(file_content, graph_def) + graph_def.ParseFromString(file_content) except (_text_format.ParseError, DecodeError): - raise ValueError( - "Unable to parse input file '{}'.".format(graph_def_file)) - sess.graph.as_default() - _import_graph_def(graph_def, name="") - - # Get input and output tensors. - input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays) - output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays) - _set_tensor_shapes(input_tensors, input_shapes) - - # Check if graph is frozen. - if not _is_frozen_graph(sess): - raise ValueError("Please freeze the graph using freeze_graph.py.") - - # Create TocoConverter class. - return cls(sess.graph_def, input_tensors, output_tensors) + try: + print("Ignore 'tcmalloc: large alloc' warnings.") + + if not isinstance(file_content, str): + if PY3: + file_content = file_content.decode("utf-8") + else: + file_content = file_content.encode("utf-8") + _text_format.Merge(file_content, graph_def) + except (_text_format.ParseError, DecodeError): + raise ValueError( + "Unable to parse input file '{}'.".format(graph_def_file)) + _import_graph_def(graph_def, name="") + + # Get input and output tensors. + input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays) + output_tensors = _get_tensors_from_tensor_names(sess.graph, + output_arrays) + _set_tensor_shapes(input_tensors, input_shapes) + + # Check if graph is frozen. + if not _is_frozen_graph(sess): + raise ValueError("Please freeze the graph using freeze_graph.py.") + + # Create TocoConverter class. + return cls(sess.graph_def, input_tensors, output_tensors) @classmethod def from_saved_model(cls, @@ -427,7 +426,6 @@ def _freeze_graph(sess, output_tensors): Frozen GraphDef. """ if not _is_frozen_graph(sess): - sess.run(_global_variables_initializer()) output_arrays = [_tensor_name(tensor) for tensor in output_tensors] return _tf_graph_util.convert_variables_to_constants( sess, sess.graph_def, output_arrays) diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py index ca2af5aaed3ee4f4fce5f0d31eaa61df0e11f364..2f1368422842846aa616eaa7bc1e60ee6b0deaaf 100644 --- a/tensorflow/contrib/lite/python/lite_test.py +++ b/tensorflow/contrib/lite/python/lite_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.saved_model import saved_model @@ -198,6 +199,7 @@ class FromSessionTest(test_util.TensorFlowTestCase): 'weights', shape=[1, 16, 16, 3], dtype=dtypes.float32) out_tensor = in_tensor + var sess = session.Session() + sess.run(_global_variables_initializer()) # Convert model and ensure model is not None. converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor]) @@ -655,9 +657,7 @@ class FromKerasFile(test_util.TensorFlowTestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) - os.remove(keras_file) - - # Check values from converted model. + # Check tensor details of converted model. interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() @@ -675,6 +675,18 @@ class FromKerasFile(test_util.TensorFlowTestCase): self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all()) self.assertEqual((0., 0.), output_details[0]['quantization']) + # Check inference of converted model. + input_data = np.array([[1, 2, 3]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], input_data) + interpreter.invoke() + tflite_result = interpreter.get_tensor(output_details[0]['index']) + + keras_model = keras.models.load_model(keras_file) + keras_result = keras_model.predict(input_data) + + np.testing.assert_almost_equal(tflite_result, keras_result, 5) + os.remove(keras_file) + def testSequentialModelInputArray(self): """Test a Sequential tf.keras model testing input arrays argument.""" keras_file = self._getSequentialModel() @@ -755,17 +767,17 @@ class FromKerasFile(test_util.TensorFlowTestCase): model.predict(x) fd, keras_file = tempfile.mkstemp('.h5') - keras.models.save_model(model, keras_file) + try: + keras.models.save_model(model, keras_file) + finally: + os.close(fd) # Convert to TFLite model. converter = lite.TocoConverter.from_keras_model_file(keras_file) tflite_model = converter.convert() self.assertTrue(tflite_model) - os.close(fd) - os.remove(keras_file) - - # Check values from converted model. + # Check tensor details of converted model. interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() @@ -783,6 +795,18 @@ class FromKerasFile(test_util.TensorFlowTestCase): self.assertTrue(([1, 3] == output_details[0]['shape']).all()) self.assertEqual((0., 0.), output_details[0]['quantization']) + # Check inference of converted model. + input_data = np.array([[1, 2, 3]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], input_data) + interpreter.invoke() + tflite_result = interpreter.get_tensor(output_details[0]['index']) + + keras_model = keras.models.load_model(keras_file) + keras_result = keras_model.predict(input_data) + + np.testing.assert_almost_equal(tflite_result, keras_result, 5) + os.remove(keras_file) + def testFunctionalModelMultipleInputs(self): """Test a Functional tf.keras model with multiple inputs and outputs.""" a = keras.layers.Input(shape=(3,), name='input_a') @@ -865,17 +889,17 @@ class FromKerasFile(test_util.TensorFlowTestCase): model.predict(x) fd, keras_file = tempfile.mkstemp('.h5') - keras.models.save_model(model, keras_file) + try: + keras.models.save_model(model, keras_file) + finally: + os.close(fd) # Convert to TFLite model. converter = lite.TocoConverter.from_keras_model_file(keras_file) tflite_model = converter.convert() self.assertTrue(tflite_model) - os.close(fd) - os.remove(keras_file) - - # Check values from converted model. + # Check tensor details of converted model. interpreter = Interpreter(model_content=tflite_model) interpreter.allocate_tensors() @@ -893,6 +917,18 @@ class FromKerasFile(test_util.TensorFlowTestCase): self.assertTrue(([1, 3, 3] == output_details[0]['shape']).all()) self.assertEqual((0., 0.), output_details[0]['quantization']) + # Check inference of converted model. + input_data = np.array([[1, 2, 3]], dtype=np.float32) + interpreter.set_tensor(input_details[0]['index'], input_data) + interpreter.invoke() + tflite_result = interpreter.get_tensor(output_details[0]['index']) + + keras_model = keras.models.load_model(keras_file) + keras_result = keras_model.predict(input_data) + + np.testing.assert_almost_equal(tflite_result, keras_result, 5) + os.remove(keras_file) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py index d17482e60113da5bad3a76fa2ab634ae0ffb89fd..a76cc3963580767ab8bd745a9bcd7c9c780ec2b5 100644 --- a/tensorflow/contrib/lite/python/tflite_convert.py +++ b/tensorflow/contrib/lite/python/tflite_convert.py @@ -203,8 +203,9 @@ def _check_flags(flags, unparsed): raise ValueError("--default_ranges_min and --default_ranges_max must be " "used together") - if flags.dump_graphviz_video and not flags.dump_graphviz: - raise ValueError("--dump_graphviz_video must be used with --dump_graphviz") + if flags.dump_graphviz_video and not flags.dump_graphviz_dir: + raise ValueError("--dump_graphviz_video must be used with " + "--dump_graphviz_dir") def run_main(_): diff --git a/tensorflow/contrib/lite/rpi_makefile.inc b/tensorflow/contrib/lite/rpi_makefile.inc deleted file mode 100644 index 832ef5824bea86a368184bd7e3d17915739e9d46..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/lite/rpi_makefile.inc +++ /dev/null @@ -1,33 +0,0 @@ -# Settings for Raspberry Pi. -ifeq ($(TARGET), RPI) - ifeq ($(TARGET_ARCH), armv7) - CXXFLAGS += \ - -march=armv7-a \ - -mfpu=neon-vfpv4 \ - -funsafe-math-optimizations \ - -ftree-vectorize - - CCFLAGS += \ - -march=armv7-a \ - -mfpu=neon-vfpv4 \ - -funsafe-math-optimizations \ - -ftree-vectorize - - LDFLAGS := \ - -Wl,--no-export-dynamic \ - -Wl,--exclude-libs,ALL \ - -Wl,--gc-sections \ - -Wl,--as-needed - endif - - LIBS := \ - -lstdc++ \ - -lpthread \ - -lm \ - -ldl - - OBJDIR := $(OBJDIR)rpi_$(TARGET_ARCH)/ - LIBDIR := $(LIBDIR)rpi_$(TARGET_ARCH)/ - BINDIR := $(BINDIR)rpi_$(TARGET_ARCH)/ - DEPDIR := $(DEPDIR)rpi_$(TARGET_ARCH)/ -endif diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs index 8ed98ddaf40d1ca4d524407458d7b65d76c3ef2c..14f88b4c009e4f7cd913c2a27799ab418562fb1f 100644 --- a/tensorflow/contrib/lite/schema/schema.fbs +++ b/tensorflow/contrib/lite/schema/schema.fbs @@ -167,6 +167,8 @@ enum BuiltinOperator : byte { PACK = 83, LOGICAL_OR = 84, ONE_HOT = 85, + LOGICAL_AND = 86, + LOGICAL_NOT = 87, } // Options for the builtin operators. @@ -232,6 +234,8 @@ union BuiltinOptions { PackOptions, LogicalOrOptions, OneHotOptions, + LogicalAndOptions, + LogicalNotOptions, } enum Padding : byte { SAME, VALID } @@ -555,6 +559,12 @@ table OneHotOptions { axis:int; } +table LogicalAndOptions { +} + +table LogicalNotOptions { +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. table OperatorCode { diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h index 4402f89b85de1df958fd32f57fae8ba9a0c6efee..3efa153e2cfd98dcac9352ff0ef4d8eb9bb6b66a 100755 --- a/tensorflow/contrib/lite/schema/schema_generated.h +++ b/tensorflow/contrib/lite/schema/schema_generated.h @@ -214,6 +214,12 @@ struct LogicalOrOptionsT; struct OneHotOptions; struct OneHotOptionsT; +struct LogicalAndOptions; +struct LogicalAndOptionsT; + +struct LogicalNotOptions; +struct LogicalNotOptionsT; + struct OperatorCode; struct OperatorCodeT; @@ -365,11 +371,13 @@ enum BuiltinOperator { BuiltinOperator_PACK = 83, BuiltinOperator_LOGICAL_OR = 84, BuiltinOperator_ONE_HOT = 85, + BuiltinOperator_LOGICAL_AND = 86, + BuiltinOperator_LOGICAL_NOT = 87, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_ONE_HOT + BuiltinOperator_MAX = BuiltinOperator_LOGICAL_NOT }; -inline BuiltinOperator (&EnumValuesBuiltinOperator())[85] { +inline BuiltinOperator (&EnumValuesBuiltinOperator())[87] { static BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -455,7 +463,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[85] { BuiltinOperator_REDUCE_MAX, BuiltinOperator_PACK, BuiltinOperator_LOGICAL_OR, - BuiltinOperator_ONE_HOT + BuiltinOperator_ONE_HOT, + BuiltinOperator_LOGICAL_AND, + BuiltinOperator_LOGICAL_NOT }; return values; } @@ -548,6 +558,8 @@ inline const char **EnumNamesBuiltinOperator() { "PACK", "LOGICAL_OR", "ONE_HOT", + "LOGICAL_AND", + "LOGICAL_NOT", nullptr }; return names; @@ -621,11 +633,13 @@ enum BuiltinOptions { BuiltinOptions_PackOptions = 59, BuiltinOptions_LogicalOrOptions = 60, BuiltinOptions_OneHotOptions = 61, + BuiltinOptions_LogicalAndOptions = 62, + BuiltinOptions_LogicalNotOptions = 63, BuiltinOptions_MIN = BuiltinOptions_NONE, - BuiltinOptions_MAX = BuiltinOptions_OneHotOptions + BuiltinOptions_MAX = BuiltinOptions_LogicalNotOptions }; -inline BuiltinOptions (&EnumValuesBuiltinOptions())[62] { +inline BuiltinOptions (&EnumValuesBuiltinOptions())[64] { static BuiltinOptions values[] = { BuiltinOptions_NONE, BuiltinOptions_Conv2DOptions, @@ -688,7 +702,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[62] { BuiltinOptions_FakeQuantOptions, BuiltinOptions_PackOptions, BuiltinOptions_LogicalOrOptions, - BuiltinOptions_OneHotOptions + BuiltinOptions_OneHotOptions, + BuiltinOptions_LogicalAndOptions, + BuiltinOptions_LogicalNotOptions }; return values; } @@ -757,6 +773,8 @@ inline const char **EnumNamesBuiltinOptions() { "PackOptions", "LogicalOrOptions", "OneHotOptions", + "LogicalAndOptions", + "LogicalNotOptions", nullptr }; return names; @@ -1015,6 +1033,14 @@ template<> struct BuiltinOptionsTraits { static const BuiltinOptions enum_value = BuiltinOptions_OneHotOptions; }; +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalAndOptions; +}; + +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_LogicalNotOptions; +}; + struct BuiltinOptionsUnion { BuiltinOptions type; void *value; @@ -1534,6 +1560,22 @@ struct BuiltinOptionsUnion { return type == BuiltinOptions_OneHotOptions ? reinterpret_cast(value) : nullptr; } + LogicalAndOptionsT *AsLogicalAndOptions() { + return type == BuiltinOptions_LogicalAndOptions ? + reinterpret_cast(value) : nullptr; + } + const LogicalAndOptionsT *AsLogicalAndOptions() const { + return type == BuiltinOptions_LogicalAndOptions ? + reinterpret_cast(value) : nullptr; + } + LogicalNotOptionsT *AsLogicalNotOptions() { + return type == BuiltinOptions_LogicalNotOptions ? + reinterpret_cast(value) : nullptr; + } + const LogicalNotOptionsT *AsLogicalNotOptions() const { + return type == BuiltinOptions_LogicalNotOptions ? + reinterpret_cast(value) : nullptr; + } }; bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); @@ -5527,6 +5569,86 @@ inline flatbuffers::Offset CreateOneHotOptions( flatbuffers::Offset CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct LogicalAndOptionsT : public flatbuffers::NativeTable { + typedef LogicalAndOptions TableType; + LogicalAndOptionsT() { + } +}; + +struct LogicalAndOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef LogicalAndOptionsT NativeTableType; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LogicalAndOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LogicalAndOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LogicalAndOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit LogicalAndOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + LogicalAndOptionsBuilder &operator=(const LogicalAndOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateLogicalAndOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + LogicalAndOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateLogicalAndOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct LogicalNotOptionsT : public flatbuffers::NativeTable { + typedef LogicalNotOptions TableType; + LogicalNotOptionsT() { + } +}; + +struct LogicalNotOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef LogicalNotOptionsT NativeTableType; + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + verifier.EndTable(); + } + LogicalNotOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(LogicalNotOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct LogicalNotOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + explicit LogicalNotOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + LogicalNotOptionsBuilder &operator=(const LogicalNotOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateLogicalNotOptions( + flatbuffers::FlatBufferBuilder &_fbb) { + LogicalNotOptionsBuilder builder_(_fbb); + return builder_.Finish(); +} + +flatbuffers::Offset CreateLogicalNotOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct OperatorCodeT : public flatbuffers::NativeTable { typedef OperatorCode TableType; BuiltinOperator builtin_code; @@ -5843,6 +5965,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const OneHotOptions *builtin_options_as_OneHotOptions() const { return builtin_options_type() == BuiltinOptions_OneHotOptions ? static_cast(builtin_options()) : nullptr; } + const LogicalAndOptions *builtin_options_as_LogicalAndOptions() const { + return builtin_options_type() == BuiltinOptions_LogicalAndOptions ? static_cast(builtin_options()) : nullptr; + } + const LogicalNotOptions *builtin_options_as_LogicalNotOptions() const { + return builtin_options_type() == BuiltinOptions_LogicalNotOptions ? static_cast(builtin_options()) : nullptr; + } const flatbuffers::Vector *custom_options() const { return GetPointer *>(VT_CUSTOM_OPTIONS); } @@ -6118,6 +6246,14 @@ template<> inline const OneHotOptions *Operator::builtin_options_as inline const LogicalAndOptions *Operator::builtin_options_as() const { + return builtin_options_as_LogicalAndOptions(); +} + +template<> inline const LogicalNotOptions *Operator::builtin_options_as() const { + return builtin_options_as_LogicalNotOptions(); +} + struct OperatorBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -8259,6 +8395,52 @@ inline flatbuffers::Offset CreateOneHotOptions(flatbuffers::FlatB _axis); } +inline LogicalAndOptionsT *LogicalAndOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new LogicalAndOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void LogicalAndOptions::UnPackTo(LogicalAndOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset LogicalAndOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateLogicalAndOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateLogicalAndOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalAndOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LogicalAndOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLogicalAndOptions( + _fbb); +} + +inline LogicalNotOptionsT *LogicalNotOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new LogicalNotOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void LogicalNotOptions::UnPackTo(LogicalNotOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; +} + +inline flatbuffers::Offset LogicalNotOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateLogicalNotOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateLogicalNotOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalNotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LogicalNotOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + return tflite::CreateLogicalNotOptions( + _fbb); +} + inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new OperatorCodeT(); UnPackTo(_o, _resolver); @@ -8692,6 +8874,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return false; } } @@ -8954,6 +9144,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -9204,6 +9402,14 @@ inline flatbuffers::Offset BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff auto ptr = reinterpret_cast(value); return CreateOneHotOptions(_fbb, ptr, _rehasher).Union(); } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(value); + return CreateLogicalAndOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(value); + return CreateLogicalNotOptions(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -9454,6 +9660,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL value = new OneHotOptionsT(*reinterpret_cast(u.value)); break; } + case BuiltinOptions_LogicalAndOptions: { + value = new LogicalAndOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_LogicalNotOptions: { + value = new LogicalNotOptionsT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -9766,6 +9980,16 @@ inline void BuiltinOptionsUnion::Reset() { delete ptr; break; } + case BuiltinOptions_LogicalAndOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_LogicalNotOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = nullptr; diff --git a/tensorflow/contrib/lite/schema/upgrade_schema.py b/tensorflow/contrib/lite/schema/upgrade_schema.py index e0b36d3d3ee94b00cccd3968d14c63fe19c3c27c..a2ddf6295014f3b29fa584f2bb367a7e0a4399e7 100644 --- a/tensorflow/contrib/lite/schema/upgrade_schema.py +++ b/tensorflow/contrib/lite/schema/upgrade_schema.py @@ -99,9 +99,9 @@ class Converter(object): # dispatch function table. self._schemas.sort() self._new_version, self._new_schema = self._schemas[-1][:2] - self._upgrade_dispatch = dict( - (version, dispatch) - for version, unused1, unused2, dispatch in self._schemas) + self._upgrade_dispatch = { + version: dispatch + for version, unused1, unused2, dispatch in self._schemas} def _Read(self, input_file, schema, raw_binary=False): """Read a tflite model assuming the given flatbuffer schema. diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py index 6d03c0fd9ec980272b45d6a8072a98ee6564ca03..52ef0d5b86524d605b2f5d6dbae98d4c343ad6a0 100644 --- a/tensorflow/contrib/lite/testing/generate_examples.py +++ b/tensorflow/contrib/lite/testing/generate_examples.py @@ -90,8 +90,6 @@ TEST_INPUT_DEPTH = 3 # matching the expression will be considered due to the corresponding bug. KNOWN_BUGS = { # TOCO doesn't support scalars as input. - r"relu.*input_shape=\[\]": "67587484", - r"sigmoid.*input_shape=\[\]": "67645668", # Concat doesn't work with a single input tensor r"concat.*num_tensors=1": "67378344", # Transposition in MatMul is not fully supported. @@ -104,8 +102,6 @@ KNOWN_BUGS = { r"div.*int32": "72051395", # No support for SplitV r"split.*num_or_size_splits=\[2,2\]": "73377559", - # Scalar constants don't work. - r"constant.*shape=\[\]": "109811500", } @@ -230,6 +226,7 @@ _TF_TYPE_INFO = { tf.float16: (np.float16, "FLOAT"), tf.int32: (np.int32, "INT32"), tf.uint8: (np.uint8, "QUANTIZED_UINT8"), + tf.int16: (np.int16, "QUANTIZED_INT16"), tf.int64: (np.int64, "INT64"), tf.bool: (np.bool, "BOOL"), } @@ -243,7 +240,7 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100): if dtype in (tf.float32, tf.float16): value = (max_value-min_value)*np.random.random_sample(shape)+min_value - elif dtype in (tf.int32, tf.uint8, tf.int64): + elif dtype in (tf.int32, tf.uint8, tf.int64, tf.int16): value = np.random.randint(min_value, max_value+1, shape) elif dtype == tf.bool: value = np.random.choice([True, False], size=shape) @@ -259,7 +256,7 @@ def create_scalar_data(dtype, min_value=-100, max_value=100): if dtype in (tf.float32, tf.float16): value = (max_value - min_value) * np.random.random() + min_value - elif dtype in (tf.int32, tf.uint8, tf.int64): + elif dtype in (tf.int32, tf.uint8, tf.int64, tf.int16): value = np.random.randint(min_value, max_value + 1) return np.array(value, dtype=dtype) @@ -687,12 +684,20 @@ def make_relu6_tests(zip_path): def make_prelu_tests(zip_path): """Make a set of tests to do PReLU.""" - test_parameters = [{ - # The canonical case for image processing is having a 4D `input` (NHWC) - # and `shared_axes`=[1, 2], so the alpha parameter is per channel. - "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]], - "shared_axes": [[1, 2], [1]], - }] + test_parameters = [ + { + # The canonical case for image processing is having a 4D `input` + # (NHWC)and `shared_axes`=[1, 2], so the alpha parameter is per + # channel. + "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]], + "shared_axes": [[1, 2], [1]], + }, + { + # 2D-3D example. Share the 2nd axis. + "input_shape": [[20, 20], [20, 20, 20]], + "shared_axes": [[1]], + } + ] def build_graph(parameters): """Build the graph for the test case.""" @@ -816,11 +821,13 @@ def make_binary_op_tests(zip_path, binary_operator): make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) -def make_reduce_tests(reduce_op): +def make_reduce_tests(reduce_op, min_value=-10, max_value=10): """Make a set of tests to do reduce operation. Args: reduce_op: TensorFlow reduce operation to test, i.e. `tf.reduce_mean`. + min_value: min value for created tensor data. + max_value: max value for created tensor data. Returns: a function representing the true generator with `reduce_op_in` curried. @@ -883,10 +890,12 @@ def make_reduce_tests(reduce_op): def build_inputs(parameters, sess, inputs, outputs): values = [ - create_tensor_data(parameters["input_dtype"], - parameters["input_shape"], - min_value=-10, - max_value=10)] + create_tensor_data( + parameters["input_dtype"], + parameters["input_shape"], + min_value=min_value, + max_value=max_value) + ] if not parameters["const_axis"]: values.append(np.array(parameters["axis"])) return values, sess.run(outputs, feed_dict=dict(zip(inputs, values))) @@ -908,7 +917,8 @@ def make_sum_tests(zip_path): def make_reduce_prod_tests(zip_path): """Make a set of tests to do prod.""" - return make_reduce_tests(tf.reduce_prod)(zip_path) + # set min max value to be -2, 2 to avoid overflow. + return make_reduce_tests(tf.reduce_prod, -2, 2)(zip_path) def make_reduce_max_tests(zip_path): @@ -1347,6 +1357,7 @@ def make_concat_tests(zip_path): "base_shape": [[1, 3, 4, 3], [3, 4]], "num_tensors": [1, 2, 3, 4, 5, 6], "axis": [0, 1, 2, 3, -3, -2, -1], + "type": [tf.float32, tf.uint8, tf.int32, tf.int64], }] def get_shape(parameters, delta): @@ -1362,7 +1373,8 @@ def make_concat_tests(zip_path): def build_graph(parameters): all_tensors = [] for n in range(0, parameters["num_tensors"]): - input_tensor = tf.placeholder(dtype=tf.float32, name=("input%d" % n), + input_tensor = tf.placeholder(dtype=parameters["type"], + name=("input%d" % n), shape=get_shape(parameters, n)) all_tensors.append(input_tensor) out = tf.concat(all_tensors, parameters["axis"]) @@ -1371,8 +1383,8 @@ def make_concat_tests(zip_path): def build_inputs(parameters, sess, inputs, outputs): all_values = [] for n in range(0, parameters["num_tensors"]): - input_values = create_tensor_data(np.float32, - get_shape(parameters, n)) + input_values = create_tensor_data( + parameters["type"], get_shape(parameters, n)) all_values.append(input_values) return all_values, sess.run( outputs, feed_dict=dict(zip(inputs, all_values))) @@ -1661,7 +1673,7 @@ def make_shape_tests(zip_path): }] def build_graph(parameters): - """Build the topk op testing graph.""" + """Build the shape op testing graph.""" # Note that we intentionally leave out the shape from the input placeholder # to prevent the Shape operation from being optimized out during conversion. input_value = tf.placeholder(dtype=parameters["input_dtype"], name="input") @@ -2309,6 +2321,7 @@ def make_topk_tests(zip_path): test_parameters = [{ "input_dtype": [tf.float32, tf.int32], "input_shape": [[10], [5, 20]], + "input_k": [None, 1, 3], }] def build_graph(parameters): @@ -2317,15 +2330,23 @@ def make_topk_tests(zip_path): dtype=parameters["input_dtype"], name="input", shape=parameters["input_shape"]) - k = tf.constant(3, name="k") + if parameters["input_k"] is not None: + k = tf.placeholder(dtype=tf.int32, name="input_k", shape=[]) + else: + k = tf.constant(3, name="k") out = tf.nn.top_k(input_value, k) - return [input_value], [out[1]] + return [input_value, k], [out[1]] def build_inputs(parameters, sess, inputs, outputs): input_value = create_tensor_data(parameters["input_dtype"], parameters["input_shape"]) - return [input_value], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_value]))) + if parameters["input_k"] is not None: + k = np.array(parameters["input_k"], dtype=np.int32) + return [input_value, k], sess.run( + outputs, feed_dict=dict(zip(inputs, [input_value, k]))) + else: + return [input_value], sess.run( + outputs, feed_dict=dict(zip(inputs, [input_value]))) make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) @@ -2989,33 +3010,55 @@ def make_pack_tests(zip_path): make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) +def _make_logical_tests(op): + """Make a set of tests to do logical operations.""" + + def logical(zip_path): + """Generate examples.""" + test_parameters = [{ + "input_shape_pair": [([], []), ([1, 1, 1, 3], [1, 1, 1, 3]), + ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]), + ([5, 5], [1]), ([10], [2, 4, 10])], + }] + + def build_graph(parameters): + """Build the logical testing graph.""" + input_value1 = tf.placeholder( + dtype=tf.bool, name="input1", shape=parameters["input_shape_pair"][0]) + input_value2 = tf.placeholder( + dtype=tf.bool, name="input2", shape=parameters["input_shape_pair"][1]) + out = op(input_value1, input_value2) + return [input_value1, input_value2], [out] + + def build_inputs(parameters, sess, inputs, outputs): + input_value1 = create_tensor_data(tf.bool, + parameters["input_shape_pair"][0]) + input_value2 = create_tensor_data(tf.bool, + parameters["input_shape_pair"][1]) + return [input_value1, input_value2], sess.run( + outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2]))) + + make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) + + return logical + + def make_logical_or_tests(zip_path): """Make a set of tests to do logical_or.""" + return _make_logical_tests(tf.logical_or)(zip_path) - test_parameters = [{ - "input_shape_pair": [([], []), ([1, 1, 1, 3], [1, 1, 1, 3]), - ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]), - ([5, 5], [1]), ([10], [2, 4, 10])], - }] - def build_graph(parameters): - """Build the logical_or op testing graph.""" - input_value1 = tf.placeholder( - dtype=tf.bool, name="input1", shape=parameters["input_shape_pair"][0]) - input_value2 = tf.placeholder( - dtype=tf.bool, name="input2", shape=parameters["input_shape_pair"][1]) - out = tf.logical_or(input_value1, input_value2) - return [input_value1, input_value2], [out] +def make_logical_and_tests(zip_path): + """Make a set of tests to do logical_and.""" + return _make_logical_tests(tf.logical_and)(zip_path) - def build_inputs(parameters, sess, inputs, outputs): - input_value1 = create_tensor_data(tf.bool, - parameters["input_shape_pair"][0]) - input_value2 = create_tensor_data(tf.bool, - parameters["input_shape_pair"][1]) - return [input_value1, input_value2], sess.run( - outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2]))) - make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs) +def make_logical_xor_tests(zip_path): + """Make a set of tests to do logical_xor. + + Test logical_not as well. + """ + return _make_logical_tests(tf.logical_xor)(zip_path) # Toco binary path provided by the generate rule. diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc index 106cbc1b8e1d289ec04721611294c6a4c79dabb4..e67fee2a1ca40790a171dc236dd2d85203690a62 100644 --- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc +++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc @@ -33,13 +33,18 @@ namespace testing { namespace { bool FLAGS_ignore_known_bugs = true; -// TODO(b/71769302) zip_files_dir should have a more accurate default, if -// possible -string* FLAGS_zip_file_path = new string("./"); +// As archive file names are test-specific, no default is possible. +// +// This test supports input as both zip and tar, as a stock android image does +// not have unzip but does have tar. +string* FLAGS_zip_file_path = new string; +string* FLAGS_tar_file_path = new string; #ifndef __ANDROID__ string* FLAGS_unzip_binary_path = new string("/usr/bin/unzip"); +string* FLAGS_tar_binary_path = new string("/bin/tar"); #else string* FLAGS_unzip_binary_path = new string("/system/bin/unzip"); +string* FLAGS_tar_binary_path = new string("/system/bin/tar"); #endif bool FLAGS_use_nnapi = false; bool FLAGS_ignore_unsupported_nnapi = false; @@ -86,9 +91,6 @@ std::map kBrokenTests = { // Transpose only supports 1D-4D input tensors. {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"}, - // PRelu only supports 4D input with (1, 1, channels) 3D alpha now. - {R"(^\/prelu.*shared_axes=\[1\])", "75975192"}, - // No support for axis!=0 in GatherV2. {R"(^\/gather.*axis=1)", "76910444"}, @@ -101,11 +103,11 @@ std::map kBrokenTests = { "77546240"}, }; -// Allows test data to be unzipped into a temporary directory and makes +// Allows test data to be unarchived into a temporary directory and makes // sure those temporary directories are removed later. -class ZipEnvironment : public ::testing::Environment { +class ArchiveEnvironment : public ::testing::Environment { public: - ~ZipEnvironment() override {} + ~ArchiveEnvironment() override {} // Delete all temporary directories on teardown. void TearDown() override { @@ -117,15 +119,26 @@ class ZipEnvironment : public ::testing::Environment { temporary_directories_.clear(); } - // Unzip `zip` file into a new temporary directory `out_dir`. - tensorflow::Status UnZip(const string& zip, string* out_dir) { + // Unarchive `archive` file into a new temporary directory `out_dir`. + tensorflow::Status UnArchive(const string& zip, const string& tar, + string* out_dir) { string dir; TF_CHECK_OK(MakeTemporaryDirectory(&dir)); tensorflow::SubProcess proc; - string unzip_binary = *FLAGS_unzip_binary_path; - TF_CHECK_OK(env->FileExists(unzip_binary)); - TF_CHECK_OK(env->FileExists(zip)); - proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip}); + if (!zip.empty()) { + string unzip_binary = *FLAGS_unzip_binary_path; + TF_CHECK_OK(env->FileExists(unzip_binary)); + TF_CHECK_OK(env->FileExists(zip)); + proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip}); + } else { + string tar_binary = *FLAGS_tar_binary_path; + TF_CHECK_OK(env->FileExists(tar_binary)); + TF_CHECK_OK(env->FileExists(tar)); + // 'o' needs to be explicitly set on Android so that + // untarring works as non-root (otherwise tries to chown + // files, which fails) + proc.SetProgram(tar_binary, {"tar", "xfo", tar, "-C", dir}); + } proc.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE); proc.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE); if (!proc.Start()) @@ -159,15 +172,15 @@ class ZipEnvironment : public ::testing::Environment { std::vector temporary_directories_; }; -// Return the singleton zip_environment. -ZipEnvironment* zip_environment() { - static ZipEnvironment* env = new ZipEnvironment; +// Return the singleton archive_environment. +ArchiveEnvironment* archive_environment() { + static ArchiveEnvironment* env = new ArchiveEnvironment; return env; } -// Read the manifest.txt out of the unarchived zip file. Specifically +// Read the manifest.txt out of the unarchived archive file. Specifically // `original_file` is the original zip file for error messages. `dir` is -// the temporary directory where the zip file has been unarchived and +// the temporary directory where the archive file has been unarchived and // `test_paths` is the list of test prefixes that were in the manifest. // Note, it is an error for a manifest to contain no tests. tensorflow::Status ReadManifest(const string& original_file, const string& dir, @@ -193,12 +206,22 @@ tensorflow::Status ReadManifest(const string& original_file, const string& dir, return tensorflow::Status::OK(); } -// Get a list of tests from a zip file `zip_file_name`. -std::vector UnarchiveZipAndFindTestNames(const string& zip_file) { +// Get a list of tests from either zip or tar file +std::vector UnarchiveAndFindTestNames(const string& zip_file, + const string& tar_file) { + if (zip_file.empty() && tar_file.empty()) { + TF_CHECK_OK(tensorflow::Status(tensorflow::error::UNKNOWN, + "Neither zip_file nor tar_file was given")); + } string decompress_tmp_dir; - TF_CHECK_OK(zip_environment()->UnZip(zip_file, &decompress_tmp_dir)); + TF_CHECK_OK(archive_environment()->UnArchive(zip_file, tar_file, + &decompress_tmp_dir)); std::vector stuff; - TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff)); + if (!zip_file.empty()) { + TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff)); + } else { + TF_CHECK_OK(ReadManifest(tar_file, decompress_tmp_dir, &stuff)); + } return stuff; } @@ -226,8 +249,7 @@ TEST_P(OpsTest, RunZipTests) { string message = test_driver.GetErrorMessage(); if (bug_number.empty()) { if (FLAGS_use_nnapi && FLAGS_ignore_unsupported_nnapi && !result) { - EXPECT_EQ(message, string("Failed to invoke NNAPI interpreter")) - << message; + EXPECT_EQ(message, string("Failed to invoke interpreter")) << message; } else { EXPECT_TRUE(result) << message; } @@ -259,27 +281,34 @@ struct ZipPathParamName { } }; -INSTANTIATE_TEST_CASE_P( - tests, OpsTest, - ::testing::ValuesIn(UnarchiveZipAndFindTestNames(*FLAGS_zip_file_path)), - ZipPathParamName()); +INSTANTIATE_TEST_CASE_P(tests, OpsTest, + ::testing::ValuesIn(UnarchiveAndFindTestNames( + *FLAGS_zip_file_path, *FLAGS_tar_file_path)), + ZipPathParamName()); } // namespace testing } // namespace tflite int main(int argc, char** argv) { - ::testing::AddGlobalTestEnvironment(tflite::testing::zip_environment()); + ::testing::AddGlobalTestEnvironment(tflite::testing::archive_environment()); std::vector flags = { tensorflow::Flag( "ignore_known_bugs", &tflite::testing::FLAGS_ignore_known_bugs, "If a particular model is affected by a known bug, the " "corresponding test should expect the outputs to not match."), - tensorflow::Flag("zip_file_path", tflite::testing::FLAGS_zip_file_path, - "Required: Location of the test zip file."), + tensorflow::Flag( + "tar_file_path", tflite::testing::FLAGS_tar_file_path, + "Required (or zip_file_path): Location of the test tar file."), + tensorflow::Flag( + "zip_file_path", tflite::testing::FLAGS_zip_file_path, + "Required (or tar_file_path): Location of the test zip file."), tensorflow::Flag("unzip_binary_path", tflite::testing::FLAGS_unzip_binary_path, - "Required: Location of a suitable unzip binary."), + "Location of a suitable unzip binary."), + tensorflow::Flag("tar_binary_path", + tflite::testing::FLAGS_tar_binary_path, + "Location of a suitable tar binary."), tensorflow::Flag("use_nnapi", &tflite::testing::FLAGS_use_nnapi, "Whether to enable the NNAPI delegate"), tensorflow::Flag("ignore_unsupported_nnapi", diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD index c88079717ddc9bf39850762dffe711f0d2832d38..02d0890a7af606627c237314fe5ee108924d761d 100644 --- a/tensorflow/contrib/lite/toco/BUILD +++ b/tensorflow/contrib/lite/toco/BUILD @@ -11,6 +11,7 @@ load( "//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", + "tf_copts", ) tf_proto_library_cc( @@ -241,9 +242,11 @@ cc_library( "graph_transformations/resolve_constant_random_uniform.cc", "graph_transformations/resolve_constant_range.cc", "graph_transformations/resolve_constant_reshape.cc", + "graph_transformations/resolve_constant_select.cc", "graph_transformations/resolve_constant_shape_or_rank.cc", "graph_transformations/resolve_constant_slice.cc", "graph_transformations/resolve_constant_strided_slice.cc", + "graph_transformations/resolve_constant_tile.cc", "graph_transformations/resolve_constant_transpose.cc", "graph_transformations/resolve_constant_unary.cc", "graph_transformations/resolve_fake_quant_args_from_vars.cc", @@ -305,7 +308,7 @@ cc_library( "tensorflow_util.h", "toco_tooling.h", ], - copts = select({ + copts = tf_copts() + select({ "//tensorflow:darwin": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"], "//conditions:default": [], }), @@ -360,6 +363,7 @@ cc_library( "dump_graphviz.h", "tooling_util.h", ], + copts = tf_copts(), visibility = ["//visibility:public"], deps = [ ":model", diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc index 6877fb237c0514a972589ac0301647104f5ed7ed..30525efd2391bb63afd7035b8134e5858add45f2 100644 --- a/tensorflow/contrib/lite/toco/dump_graphviz.cc +++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc @@ -167,7 +167,7 @@ NodeProperties GetPropertiesForArray(const Model& model, node_properties.label += "]"; int buffer_size = 0; - if (IsValid(array.shape())) { + if (IsNonEmpty(array.shape())) { buffer_size = RequiredBufferSizeForShape(array.shape()); node_properties.log2_buffer_size = std::log2(static_cast(buffer_size)); diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc index 378212cb74b5a43607e93d6d00e15c296403a071..02671f0408f55726df730dbe0fe9a4f936d22632 100644 --- a/tensorflow/contrib/lite/toco/export_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc @@ -664,13 +664,25 @@ void ConvertAddNOperator(const Model& model, const AddNOperator& src_op, void ConvertMulOperator(const Model& model, const MulOperator& src_op, GraphDef* tensorflow_graph) { - tensorflow::NodeDef* add_op = tensorflow_graph->add_node(); - add_op->set_op("Mul"); - add_op->set_name(src_op.outputs[0]); + tensorflow::NodeDef* mul_op = tensorflow_graph->add_node(); + mul_op->set_op("Mul"); + mul_op->set_name(src_op.outputs[0]); CHECK_EQ(src_op.inputs.size(), 2); - *add_op->add_input() = src_op.inputs[0]; - *add_op->add_input() = src_op.inputs[1]; - (*add_op->mutable_attr())["T"].set_type( + *mul_op->add_input() = src_op.inputs[0]; + *mul_op->add_input() = src_op.inputs[1]; + (*mul_op->mutable_attr())["T"].set_type( + GetTensorFlowDataType(model, src_op.outputs[0])); +} + +void ConvertDivOperator(const Model& model, const DivOperator& src_op, + GraphDef* tensorflow_graph) { + tensorflow::NodeDef* div_op = tensorflow_graph->add_node(); + div_op->set_op("Div"); + div_op->set_name(src_op.outputs[0]); + CHECK_EQ(src_op.inputs.size(), 2); + *div_op->add_input() = src_op.inputs[0]; + *div_op->add_input() = src_op.inputs[1]; + (*div_op->mutable_attr())["T"].set_type( GetTensorFlowDataType(model, src_op.outputs[0])); } @@ -1940,6 +1952,21 @@ void ConvertLogicalOrOperator(const Model& model, (*logical_or_op->mutable_attr())["T"].set_type(data_type); } +void ConvertCTCBeamSearchDecoderOperator( + const Model& model, const CTCBeamSearchDecoderOperator& src_op, + const char* op_name, GraphDef* tensorflow_graph) { + auto* op = tensorflow_graph->add_node(); + op->set_op(op_name); + op->set_name(src_op.outputs[0]); + CHECK_EQ(src_op.inputs.size(), 2); + for (int i = 0; i < 2; ++i) { + *op->add_input() = src_op.inputs[i]; + } + (*op->mutable_attr())["beam_width"].set_i(src_op.beam_width); + (*op->mutable_attr())["top_paths"].set_i(src_op.top_paths); + (*op->mutable_attr())["merge_repeated"].set_b(src_op.merge_repeated); +} + void ConvertOperator(const Model& model, const Operator& src_op, GraphDef* tensorflow_graph) { if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) { @@ -1975,6 +2002,9 @@ void ConvertOperator(const Model& model, const Operator& src_op, } else if (src_op.type == OperatorType::kMul) { ConvertMulOperator(model, static_cast(src_op), tensorflow_graph); + } else if (src_op.type == OperatorType::kDiv) { + ConvertDivOperator(model, static_cast(src_op), + tensorflow_graph); } else if (src_op.type == OperatorType::kRelu) { ConvertReluOperator(model, static_cast(src_op), tensorflow_graph); @@ -2194,6 +2224,10 @@ void ConvertOperator(const Model& model, const Operator& src_op, ConvertLogicalOrOperator(model, static_cast(src_op), "LogicalOr", tensorflow_graph); + } else if (src_op.type == OperatorType::kCTCBeamSearchDecoder) { + ConvertCTCBeamSearchDecoderOperator( + model, static_cast(src_op), + "CTCBeamSearchDecoder", tensorflow_graph); } else { LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type); } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h index 8d9a4c4700e12ac1a187038a0a5efc1b033d4e57..99f4a7d8f61eb3e75b643673d42d4b2103309f2e 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h @@ -190,6 +190,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantSlice) DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStridedSlice) DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFill) DECLARE_GRAPH_TRANSFORMATION(ResolveConstantGather) +DECLARE_GRAPH_TRANSFORMATION(ResolveConstantSelect) +DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTile) DECLARE_GRAPH_TRANSFORMATION(ResolveMultiplyByZero) DECLARE_GRAPH_TRANSFORMATION(Dequantize) DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup) diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc index 2f1bb8f0ad6374243e5a094701eef54cd086151a..d26c3b2878b8499fcbabc5448de9ec045eb07879 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc @@ -371,12 +371,26 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) { case OperatorType::kStridedSlice: case OperatorType::kSqueeze: case OperatorType::kReshape: + case OperatorType::kExpandDims: case OperatorType::kPad: case OperatorType::kGather: case OperatorType::kTranspose: case OperatorType::kMean: changed = HardcodeMinMaxFromFirstInput(model, op); break; + case OperatorType::kSum: + // reduce_sum is expected to change the output range. Hence + // a fake_quant op is necessary in the output to minimize error. However + // in special circumstances like when computing expected value using + // reduce_sum the input range and the output range matches. Hence the + // below code would act as a fallback. If a fake_quant node is observed in + // the output that takes precendence over the hard coding logic below. + changed = HardcodeMinMaxFromFirstInput(model, op); + if (changed) { + LOG(WARNING) << "Using the input range for output in reduce_sum op." + << "This could have an impact on your model accuracy."; + } + break; case OperatorType::kSelect: changed = HardcodeMinMaxForSelect(model, op); break; diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc index f033ee013ee6f51d4e23083c467effae95a9a85d..c8310161cb33bcc7137e8b163ea6469698ed2fd7 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc @@ -215,6 +215,18 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) { model->GetArray(op->outputs[0]).data_type = on_value_type; break; } + case OperatorType::kCTCBeamSearchDecoder: { + CHECK_EQ(op->inputs.size(), 2); + // All outputs (sparse tensors) are int32s (although tf uses int64s) + // except the last one (log probabilities) is float. + const int output_size = op->outputs.size(); + for (int i = 0; i < output_size - 1; ++i) { + model->GetArray(op->outputs[i]).data_type = ArrayDataType::kInt32; + } + model->GetArray(op->outputs[output_size - 1]).data_type = + ArrayDataType::kFloat; + break; + } default: { // These operators produce outputs with the same type as their 1st input CHECK_GT(op->inputs.size(), 0); diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc index 3c9379fd878ea350064c6b0f562ae11e9a713365..91e290439ae4bfd491c8201b02b161fe2caf2f8d 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -1082,27 +1082,23 @@ void ProcessTopkV2Operator(Model* model, TopKV2Operator* op) { } // Yield until input dims have been resolved. - if (!input_values.has_shape()) { + if (!input_values.has_shape() || !input_k.has_shape()) { return; } - const auto& input_values_shape = input_values.shape(); - auto output_indexes_dims = output_indexes.mutable_shape()->mutable_dims(); - auto output_values_dims = output_values.mutable_shape()->mutable_dims(); - for (int dim = 0; dim < input_values_shape.dimensions_count() - 1; dim++) { - output_indexes_dims->push_back(input_values_shape.dims(dim)); - output_values_dims->push_back(input_values_shape.dims(dim)); - } // If the value is initialized, we can specify the last dimension, otherwise // unknown. if (input_k.buffer) { + const auto& input_values_shape = input_values.shape(); + auto output_indexes_dims = output_indexes.mutable_shape()->mutable_dims(); + auto output_values_dims = output_values.mutable_shape()->mutable_dims(); + for (int dim = 0; dim < input_values_shape.dimensions_count() - 1; dim++) { + output_indexes_dims->push_back(input_values_shape.dims(dim)); + output_values_dims->push_back(input_values_shape.dims(dim)); + } const int32_t k_value = input_k.GetBuffer().data[0]; output_indexes_dims->push_back(k_value); output_values_dims->push_back(k_value); - - } else { - output_indexes_dims->push_back(0); - output_values_dims->push_back(0); } } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc index f6ce3b3ecb2cc06708287804bf34aa152d668f8c..8d22ae2eb1356b8c9c9430c517acddfc971b9f57 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc @@ -50,7 +50,7 @@ bool SupportsQuantization(const Operator& op) { type == OperatorType::kSqueeze || type == OperatorType::kPad || type == OperatorType::kPadV2 || type == OperatorType::kReshape || type == OperatorType::kTanh || type == OperatorType::kMul || - type == OperatorType::kBatchToSpaceND || + type == OperatorType::kBatchToSpaceND || type == OperatorType::kSum || type == OperatorType::kSpaceToBatchND || type == OperatorType::kSpaceToDepth || type == OperatorType::kStridedSlice || @@ -61,9 +61,20 @@ bool SupportsQuantization(const Operator& op) { type == OperatorType::kGreaterEqual || type == OperatorType::kLess || type == OperatorType::kLessEqual || type == OperatorType::kSelect || type == OperatorType::kArgMax || type == OperatorType::kRelu || - type == OperatorType::kRelu1 || type == OperatorType::kRelu6; + type == OperatorType::kRelu1 || type == OperatorType::kRelu6 || + type == OperatorType::kShape || type == OperatorType::kExpandDims; } +// The quantized op allows output arrays of type float using +// the attribute support_output_type_float_in_quantized_op +bool SupportOutputTypeFloatInQuantizedOp(const Operator& op) { + auto type = op.type; + if (type == OperatorType::kUnsupported) { + auto* unsupported = static_cast(&op); + return unsupported->support_output_type_float_in_quantized_op; + } + return false; +} const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) { auto& array = model->GetArray(array_name); // Normally we should have a MinMax recorded on this Array, @@ -584,61 +595,67 @@ bool Quantize::Run(Model* model, std::size_t op_index) { } // Quantize outputs, add Dequantize ops as needed on the outputs side - for (std::size_t output_index = 0; output_index < op.outputs.size(); - output_index++) { - ArrayDataType quantized_data_type; - QuantizationParams quantization_params; - if (ChooseQuantizationForOperatorOutput(this, model, op, output_index, - &quantized_data_type, - &quantization_params)) { - changed = true; - const auto& output = op.outputs[output_index]; - auto& output_array = model->GetArray(output); - - // Fix up the min/max information on the output array to match the chosen - // quantization parameters. - CHECK(output_array.minmax) - << "Output array named " << output << " lacks minmax"; - auto& output_minmax = output_array.GetMinMax(); - FixMinMaxPostQuantization(this, quantized_data_type, quantization_params, - &output_minmax); - - QuantizeArray(this, model, output, quantized_data_type, - quantization_params); - - const auto& dequantized_output = - AvailableArrayName(*model, output + "_dequantized"); - auto& dequantized_output_array = - model->GetOrCreateArray(dequantized_output); - dequantized_output_array.data_type = ArrayDataType::kFloat; - dequantized_output_array.final_data_type = output_array.data_type; - auto& dequantized_output_minmax = - dequantized_output_array.GetOrCreateMinMax(); - dequantized_output_minmax.min = output_minmax.min; - dequantized_output_minmax.max = output_minmax.max; - for (const auto& other_op : model->operators) { - for (auto& other_op_input : other_op->inputs) { - if (other_op_input == output) { - other_op_input = dequantized_output; + if (SupportOutputTypeFloatInQuantizedOp(op)) { + LOG(WARNING) + << HelpfulOperatorTypeName(op) << " is a quantized op" + << "but it has a model flag that sets the output arrays to float."; + } else { + for (std::size_t output_index = 0; output_index < op.outputs.size(); + output_index++) { + QuantizationParams quantization_params; + ArrayDataType quantized_data_type; + if (ChooseQuantizationForOperatorOutput(this, model, op, output_index, + &quantized_data_type, + &quantization_params)) { + changed = true; + const auto& output = op.outputs[output_index]; + auto& output_array = model->GetArray(output); + + // Fix up the min/max information on the output array to match the + // chosen quantization parameters. + CHECK(output_array.minmax) + << "Output array named " << output << " lacks minmax"; + auto& output_minmax = output_array.GetMinMax(); + FixMinMaxPostQuantization(this, quantized_data_type, + quantization_params, &output_minmax); + + QuantizeArray(this, model, output, quantized_data_type, + quantization_params); + + const auto& dequantized_output = + AvailableArrayName(*model, output + "_dequantized"); + auto& dequantized_output_array = + model->GetOrCreateArray(dequantized_output); + dequantized_output_array.data_type = ArrayDataType::kFloat; + dequantized_output_array.final_data_type = output_array.data_type; + auto& dequantized_output_minmax = + dequantized_output_array.GetOrCreateMinMax(); + dequantized_output_minmax.min = output_minmax.min; + dequantized_output_minmax.max = output_minmax.max; + for (const auto& other_op : model->operators) { + for (auto& other_op_input : other_op->inputs) { + if (other_op_input == output) { + other_op_input = dequantized_output; + } } } - } - auto* dequantize_op = new DequantizeOperator; - dequantize_op->inputs = {output}; - dequantize_op->outputs = {dequantized_output}; - for (int i = 0; i < model->flags.output_arrays_size(); i++) { - if (model->flags.output_arrays(i) == output) { - // TODO(b/78013785): never rename output arrays. - AddMessageF( - "Renaming output array %d after inserting dequant op %s: %s -> " - "%s", - i, LogName(*dequantize_op), model->flags.output_arrays(i), - dequantized_output); - model->flags.set_output_arrays(i, dequantized_output); + auto* dequantize_op = new DequantizeOperator; + dequantize_op->inputs = {output}; + dequantize_op->outputs = {dequantized_output}; + for (int i = 0; i < model->flags.output_arrays_size(); i++) { + if (model->flags.output_arrays(i) == output) { + // TODO(b/78013785): never rename output arrays. + AddMessageF( + "Renaming output array %d after inserting dequant op %s: %s -> " + "%s", + i, LogName(*dequantize_op), model->flags.output_arrays(i), + dequantized_output); + model->flags.set_output_arrays(i, dequantized_output); + } } + const auto op_it = FindOp(*model, &op); + model->operators.emplace(op_it + 1, dequantize_op); } - const auto op_it = FindOp(*model, &op); - model->operators.emplace(op_it + 1, dequantize_op); } } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc index 9f5d8b94507ec11957c3ae55ffca510eeb81ac89..fc49fbda59c78f056a7e194367618b43c0a4a7db 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc @@ -48,20 +48,26 @@ void RerouteEdges(const string& from_array, const string& to_array, } // namespace bool RemoveTrivialPassthroughOp(GraphTransformation* transformation, - Model* model, std::size_t op_index) { + Model* model, std::size_t op_index, + int input_index) { const auto passthru_it = model->operators.begin() + op_index; auto* passthru_op = passthru_it->get(); CHECK_EQ(passthru_op->outputs.size(), 1); CHECK_GE(passthru_op->inputs.size(), 1); - int count_nonconstant_input_arrays = 0; - // We call 'main input' the unique nonconstant input array if there is one, - // or else the 0-th input. + int main_input_array_index = 0; - for (int i = 0; i < passthru_op->inputs.size(); i++) { - if (!model->GetArray(passthru_op->inputs[i]).buffer) { - count_nonconstant_input_arrays++; - if (count_nonconstant_input_arrays == 1) { - main_input_array_index = i; + if (input_index != -1) { + main_input_array_index = input_index; + } else { + // We call 'main input' the unique nonconstant input array if there is one, + // or else the 0-th input. + int count_nonconstant_input_arrays = 0; + for (int i = 0; i < passthru_op->inputs.size(); i++) { + if (!model->GetArray(passthru_op->inputs[i]).buffer) { + count_nonconstant_input_arrays++; + if (count_nonconstant_input_arrays == 1) { + main_input_array_index = i; + } } } } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h index 9d448c3ee9088c16b96aa7ddc84457d2cab3231a..663704e5acf745d3768ad682e0a7888f0a690e6c 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h @@ -50,7 +50,8 @@ namespace toco { // and then discards it and returns true, or, if it's not trivial (if neither // the input nor the output may be discarded), returns false. bool RemoveTrivialPassthroughOp(GraphTransformation* transformation, - Model* model, std::size_t op_index); + Model* model, std::size_t op_index, + int input_index = -1); } // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc index 058f314b338aeeab94cb11fb8c1163427b559d3e..d395d7a6a0862d93fd4f52bb8b8d8d3ea7f8dc1e 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc @@ -26,14 +26,17 @@ limitations under the License. namespace toco { template -void GetBoundsForQuantizedDataType(double* min, double* max) { +void GetBoundsForQuantizedDataType(float* min, float* max) { using limits = std::numeric_limits>; *min = limits::min(); *max = limits::max(); } void GetBoundsForQuantizedDataType(ArrayDataType quantized_data_type, - double* min, double* max) { + float* min, float* max) { + // It is important for matching accuracy between TF training and TFLite + // inference, that the min and max values are float to match TF's + // FakeQuantWithMinMaxVarsFunctor. switch (quantized_data_type) { case ArrayDataType::kUint8: return GetBoundsForQuantizedDataType(min, max); @@ -109,22 +112,22 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) { QuantizationParams qparams; ChooseQuantizationParamsForArrayAndQuantizedDataType( output_array, quantized_data_type, &qparams); - double quantized_min, quantized_max; + float quantized_min, quantized_max; GetBoundsForQuantizedDataType(quantized_data_type, &quantized_min, &quantized_max); if (fakequant_op->narrow_range) { quantized_min++; } - for (int i = 0; i < size; i++) { - const double src_val = input_buffer.data[i]; - const double unclamped_quantized_val = - std::round(qparams.zero_point + src_val / qparams.scale); - const double quantized_val = std::min( - quantized_max, std::max(quantized_min, unclamped_quantized_val)); - const double dst_val = qparams.scale * (quantized_val - qparams.zero_point); - output_buffer.data[i] = dst_val; - } + // It is important for matching accuracy between TF training and TFLite + // inference, that the following variables are float to match TF's + // FakeQuantWithMinMaxVarsFunctor. + const float scale = qparams.scale; + const float nudged_min = (quantized_min - qparams.zero_point) * scale; + const float nudged_max = (quantized_max - qparams.zero_point) * scale; + tflite::FakeQuantizeArray(scale, nudged_min, nudged_max, + input_buffer.data.data(), output_buffer.data.data(), + size); if (IsDiscardableArray(*model, fakequant_op->inputs[0]) && CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) { diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc new file mode 100644 index 0000000000000000000000000000000000000000..e880a3f44dab376e5e441e3d6c0f747ee8490489 --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc @@ -0,0 +1,78 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace toco { + +// Resolves a constant Select operation. +// +// This implementation is looking strictly for all-or-nothing on the select +// condition. It's possible to enhance this by looking per-element and possibly +// producing a Mul op. +bool ResolveConstantSelect::Run(Model* model, std::size_t op_index) { + auto it = model->operators.begin() + op_index; + const auto* base_op = it->get(); + if (base_op->type != OperatorType::kSelect) { + return false; + } + const auto* op = static_cast(base_op); + + CHECK_GE(op->inputs.size(), 3); + CHECK_EQ(op->outputs.size(), 1); + auto& output_array = model->GetArray(op->outputs[0]); + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes. + return false; + } + if (!output_array.has_shape()) { + // Yield until the output shape has been set by PropagateFixedShapes. + return false; + } + + // We require the cond input to be constant. + if (!IsConstantParameterArray(*model, op->inputs[0])) { + return false; + } + const Array& cond_array = model->GetArray(op->inputs[0]); + CHECK(cond_array.data_type == ArrayDataType::kBool) + << "Only bool conditions are supported"; + const auto& cond_data = cond_array.GetBuffer().data; + if (cond_data.empty()) { + return false; + } + + // Check if the condition is the same for all elements. + bool cond_value = cond_data[0]; + for (size_t i = 1; i < cond_data.size(); ++i) { + if (cond_data[i] != cond_value) { + AddMessageF( + "Cannot resolve %s as constant; cond_array has differing " + "per-element values", + LogName(*op)); + return false; + } + } + + // Pass-through the selected input. + return RemoveTrivialPassthroughOp(this, model, op_index, cond_value ? 1 : 2); +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b0d0707146255562c093dd27b91ccb2b603a587 --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc @@ -0,0 +1,173 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace toco { + +namespace { + +// NOTE: the Tile implementation here is taken from tflite's Tile kernel. + +template +void CopyMultipleTimes(const T* in_data, int32_t in_size, int32_t multiplier, + T* out_data) { + for (int i = 0; i < multiplier; ++i) { + const T* in_end = in_data + in_size; + T* new_out_data = std::copy(in_data, in_end, out_data); + in_data = out_data; + out_data = new_out_data; + } +} + +template +std::pair TileOneDimension(const Shape& in_dimensions, + const T* in_data, const M* multipliers, + T* out_data, int dimension) { + const int dimension_size = in_dimensions.dims(dimension); + if (dimension == in_dimensions.dimensions_count() - 1) { + CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], + out_data); + return std::make_pair( + dimension_size, + dimension_size * static_cast(multipliers[dimension])); + } + int total_stride_size = 0, total_tiled_stride_size = 0; + const T* copy_from_data = in_data; + T* copy_to_data = out_data; + for (int i = 0; i < dimension_size; ++i) { + int stride_size = 0, tiled_stride_size = 0; + std::tie(stride_size, tiled_stride_size) = + TileOneDimension(in_dimensions, copy_from_data, multipliers, + copy_to_data, dimension + 1); + copy_from_data += stride_size; + copy_to_data += tiled_stride_size; + total_stride_size += stride_size; + total_tiled_stride_size += tiled_stride_size; + } + CopyMultipleTimes(out_data, total_tiled_stride_size, + multipliers[dimension] - 1, + out_data + total_tiled_stride_size); + return std::make_pair(total_stride_size, + total_tiled_stride_size * multipliers[dimension]); +} + +template +inline void Tile(const Array& input_array, const Array& multiples_array, + Array* output_array) { + // Allocate output storage. + auto& output_data = output_array->GetMutableBuffer().data; + output_data.resize(RequiredBufferSizeForShape(output_array->shape())); + + switch (multiples_array.data_type) { + case ArrayDataType::kInt32: + TileOneDimension( + input_array.shape(), input_array.GetBuffer().data.data(), + multiples_array.GetBuffer().data.data(), + output_array->GetMutableBuffer().data.data(), 0); + break; + case ArrayDataType::kInt64: + TileOneDimension( + input_array.shape(), input_array.GetBuffer().data.data(), + multiples_array.GetBuffer().data.data(), + output_array->GetMutableBuffer().data.data(), 0); + break; + default: + CHECK(false); + break; + } +} + +} // namespace + +// Resolves a constant Tile operation. +bool ResolveConstantTile::Run(Model* model, std::size_t op_index) { + auto it = model->operators.begin() + op_index; + const auto* base_op = it->get(); + if (base_op->type != OperatorType::kTile) { + return false; + } + const auto* op = static_cast(base_op); + + CHECK_GE(op->inputs.size(), 2); + CHECK_EQ(op->outputs.size(), 1); + auto& output_array = model->GetArray(op->outputs[0]); + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes. + return false; + } + if (!output_array.has_shape()) { + // Yield until the output shape has been set by PropagateFixedShapes. + return false; + } + + // We require constant inputs. + if (!IsConstantParameterArray(*model, op->inputs[0]) || + !IsConstantParameterArray(*model, op->inputs[1])) { + return false; + } + const Array& input_array = model->GetArray(op->inputs[0]); + const Array& multiples_array = model->GetArray(op->inputs[1]); + CHECK(multiples_array.data_type == ArrayDataType::kInt32 || + multiples_array.data_type == ArrayDataType::kInt64) + << "Only int32/int64 indices are supported"; + + // Copy min/max info if present. The ranges of the selected values may be + // a subset of the original range but we want to ensure the quantization + // params stay the same. + if (input_array.minmax) { + const auto& input_minmax = input_array.GetMinMax(); + auto& output_minmax = output_array.GetOrCreateMinMax(); + output_minmax.min = input_minmax.min; + output_minmax.max = input_minmax.max; + } + + CHECK(!output_array.buffer); + switch (output_array.data_type) { + case ArrayDataType::kFloat: + Tile(input_array, multiples_array, &output_array); + break; + case ArrayDataType::kUint8: + Tile(input_array, multiples_array, &output_array); + break; + case ArrayDataType::kInt16: + Tile(input_array, multiples_array, &output_array); + break; + case ArrayDataType::kInt32: + Tile(input_array, multiples_array, &output_array); + break; + case ArrayDataType::kInt64: + Tile(input_array, multiples_array, &output_array); + break; + default: + LOG(FATAL) << "Unsupported data type given to Tile op with output \"" + << op->outputs[0] << "\""; + break; + } + + // Erase input arrays if no longer used after we remove the op. + DeleteArrayIfUsedOnce(op->inputs[0], model); + DeleteArrayIfUsedOnce(op->inputs[1], model); + + // Erase the operator. + model->operators.erase(it); + return true; +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc index da8e7a2d1c06cf89b9708b404da7667565245f8f..8bef440afd21572d7014e4f376be3aba2d80127d 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc @@ -92,7 +92,9 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) { if (*input_it == switch_op->outputs[nonselected_output_index]) { // Let us guard our assumption that only Merge nodes consume the outputs // of Switch nodes: - CHECK(other_op->type == OperatorType::kMerge); + CHECK(other_op->type == OperatorType::kMerge) + << "Found " << HelpfulOperatorTypeName(*other_op) + << " as non-selected output from Switch, but only Merge supported."; input_it = other_op->inputs.erase(input_it); } else { ++input_it; diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 9a3db5c888cd091a28bb4feaf0bbffc8742f90b9..b7fffbce2223a71ac1e16ec1ce18ba9f610cc2ac 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -1049,6 +1049,8 @@ tensorflow::Status ConvertUnsupportedOperator( static constexpr char kAttrOutputQuantized[] = "_output_quantized"; static constexpr char kAttrOutputTypes[] = "_output_types"; static constexpr char kAttrOutputShapes[] = "_output_shapes"; + static constexpr char kAttrSupportOutputTypeFloatInQuantizedOp[] = + "_support_output_type_float_in_quantized_op"; LOG(INFO) << "Converting unsupported operation: " << node.op(); auto* op = new TensorFlowUnsupportedOperator; @@ -1060,9 +1062,15 @@ tensorflow::Status ConvertUnsupportedOperator( op->tensorflow_op = node.op(); node.SerializeToString(&op->tensorflow_node_def); model->operators.emplace_back(op); + // Parse if the op supports quantization if (HasAttr(node, kAttrOutputQuantized)) { op->quantized = GetBoolAttr(node, kAttrOutputQuantized); } + // Parse if the quantized op allows output arrays of type float + if (HasAttr(node, kAttrSupportOutputTypeFloatInQuantizedOp)) { + op->support_output_type_float_in_quantized_op = + GetBoolAttr(node, kAttrSupportOutputTypeFloatInQuantizedOp); + } if (HasAttr(node, kAttrOutputTypes)) { const auto& output_types = GetListAttr(node, kAttrOutputTypes); for (int i = 0; i < output_types.type_size(); ++i) { @@ -1215,11 +1223,10 @@ tensorflow::Status ConvertGatherOperator( return tensorflow::Status::OK(); } -template +template tensorflow::Status ConvertArgMinMaxOperator( const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, Model* model) { - CHECK_EQ(node.op(), op_name); TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2)); const auto axis_data_type = HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32; @@ -1237,6 +1244,20 @@ tensorflow::Status ConvertArgMinMaxOperator( return tensorflow::Status::OK(); } +tensorflow::Status ConvertArgMaxOperator( + const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, + Model* model) { + CHECK_EQ(node.op(), "ArgMax"); + return ConvertArgMinMaxOperator(node, tf_import_flags, model); +} + +tensorflow::Status ConvertArgMinOperator( + const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, + Model* model) { + CHECK_EQ(node.op(), "ArgMin"); + return ConvertArgMinMaxOperator(node, tf_import_flags, model); +} + tensorflow::Status ConvertResizeBilinearOperator( const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, Model* model) { @@ -1854,6 +1875,34 @@ tensorflow::Status ConvertOneHotOperator( return tensorflow::Status::OK(); } +tensorflow::Status ConvertCTCBeamSearchDecoderOperator( + const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, + Model* model) { + CHECK_EQ(node.op(), "CTCBeamSearchDecoder"); + TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2)); + + auto* op = new CTCBeamSearchDecoderOperator; + for (const string& input : node.input()) { + op->inputs.push_back(input); + } + + op->beam_width = + HasAttr(node, "beam_width") ? GetIntAttr(node, "beam_width") : 1; + op->top_paths = + HasAttr(node, "top_paths") ? GetIntAttr(node, "top_paths") : 1; + op->merge_repeated = HasAttr(node, "merge_repeated") + ? GetBoolAttr(node, "merge_repeated") + : true; + + // There are top_paths + 1 outputs. + op->outputs.push_back(node.name()); // Implicit :0. + for (int i = 0; i < op->top_paths; ++i) { + op->outputs.push_back(node.name() + ":" + std::to_string(i + 1)); + } + model->operators.emplace_back(op); + return tensorflow::Status::OK(); +} + } // namespace namespace internal { @@ -1863,17 +1912,14 @@ using ConverterType = tensorflow::Status (*)( Model* model); using ConverterMapType = std::unordered_map; -constexpr char kArgMax[] = "ArgMax"; -constexpr char kArgMin[] = "ArgMin"; - ConverterMapType GetTensorFlowNodeConverterMap() { return std::unordered_map({ {"Add", ConvertSimpleOperator}, {"AddN", ConvertSimpleOperator}, {"All", ConvertSimpleOperator}, {"Any", ConvertAnyOperator}, - {"ArgMax", ConvertArgMinMaxOperator}, - {"ArgMin", ConvertArgMinMaxOperator}, + {"ArgMax", ConvertArgMaxOperator}, + {"ArgMin", ConvertArgMinOperator}, {"Assert", ConvertSimpleOperator}, {"AvgPool", ConvertAvgPoolOperator}, {"BatchMatMul", ConvertBatchMatMulOperator}, @@ -1888,6 +1934,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() { {"Const", ConvertConstOperator}, {"Conv2D", ConvertConvOperator}, {"Conv2DBackpropInput", ConvertTransposeConvOperator}, + {"CTCBeamSearchDecoder", ConvertCTCBeamSearchDecoderOperator}, {"DepthToSpace", ConvertDepthToSpaceOperator}, {"DepthwiseConv2dNative", ConvertDepthwiseConvOperator}, {"Div", ConvertSimpleOperator}, diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index 7d0dbfcc0550c043e868ceb4d131fbe9c2fdfd0d..412e14c4ada3280dafcd2fcfa59e2908dd785f9f 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -148,6 +148,7 @@ enum class OperatorType : uint8 { kLogicalAnd, kLogicalNot, kLogicalOr, + kCTCBeamSearchDecoder, }; // Helper to deal with TensorFlow arrays using a different ordering of @@ -438,6 +439,28 @@ struct ConvOperator : Operator { int dilation_height_factor = 1; }; +// CTCBeamSearchDecoder operator: +// +// Inputs: +// inputs[0]: required: the logits. +// inputs[1]: required: sequence length. +// inputs[2]: optional: beam width. +// inputs[3]: optional: top paths. +// inputs[4]: optional: merge repeated. +// +// Outputs: +// outputs[0]: deocoded. +// outputs[1]: log probability. +// +// TensorFlow equivalent: CTCBeamSearchDecoder +struct CTCBeamSearchDecoderOperator : Operator { + CTCBeamSearchDecoderOperator() + : Operator(OperatorType::kCTCBeamSearchDecoder) {} + int beam_width; + int top_paths; + bool merge_repeated = true; +}; + // Depthwise-separable convolution operator. // // Inputs: @@ -1509,6 +1532,9 @@ struct TensorFlowUnsupportedOperator : Operator { string tensorflow_node_def; // A boolean indicating if the unsupported op should be treated as quantized. bool quantized = false; + // A boolean indicating if the unsupported op output should allow float values + // in quantized mode. + bool support_output_type_float_in_quantized_op = false; // Output data types std::vector output_data_types; // Output shapes. @@ -2045,7 +2071,7 @@ class Model { std::size_t transient_data_size = 0; // For code-generation only: required alignment of the transient_data buffer std::size_t transient_data_alignment = 0; - // Arithmatic operations performed in the model. + // Arithmetic operations performed in the model. int64 ops_count = 0; private: diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD index 83e977d7b3b0a4d572faee3ba7e36690896ac8e8..709c53606b1081111fb2e2f8971ba71e5d38b629 100644 --- a/tensorflow/contrib/lite/toco/tflite/BUILD +++ b/tensorflow/contrib/lite/toco/tflite/BUILD @@ -27,6 +27,7 @@ cc_library( "//tensorflow/contrib/lite/toco:graph_transformations", "//tensorflow/contrib/lite/toco:model", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:ptr_util", "@com_google_absl//absl/memory", "@flatbuffers", ], diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc index 9380168f30522ad49f0cc6bc8d50539e45905e1e..75808f2b690fb6699f86d61a3078ef458db6d295 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator.cc @@ -21,9 +21,9 @@ limitations under the License. #include "tensorflow/contrib/lite/toco/tflite/custom_operator.h" #include "tensorflow/contrib/lite/toco/tflite/simple_operator.h" #include "tensorflow/contrib/lite/toco/tflite/types.h" - #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/util/ptr_util.h" namespace toco { @@ -1070,6 +1070,27 @@ class OneHot : public BuiltinOperator { + public: + using CustomOperator::CustomOperator; + + void WriteOptions(const TocoOperator& op, + flexbuffers::Builder* fbb) const override { + fbb->Int("beam_width", op.beam_width); + fbb->Int("top_paths", op.top_paths); + fbb->Bool("merge_repeated", op.merge_repeated); + } + + void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override { + op->beam_width = m["beam_width"].AsInt32(); + op->top_paths = m["top_paths"].AsInt32(); + op->merge_repeated = m["merge_repeated"].AsBool(); + } + + int GetVersion(const Operator& op) const override { return 1; } +}; + class TensorFlowUnsupported : public BaseOperator { public: using BaseOperator::BaseOperator; @@ -1179,6 +1200,12 @@ class TensorFlowUnsupported : public BaseOperator { break; case flexbuffers::TYPE_BOOL: (*attr)[key].set_b(value.AsBool()); + if (string(key) == "_output_quantized") { + op->quantized = value.AsBool(); + } + if (string(key) == "_support_output_type_float_in_quantized_op") { + op->support_output_type_float_in_quantized_op = value.AsBool(); + } break; case flexbuffers::TYPE_VECTOR_INT: { auto* list = (*attr)[key].mutable_list(); @@ -1208,156 +1235,175 @@ namespace { // Build a vector containing all the known operators. std::vector> BuildOperatorList() { std::vector> ops; - + using tensorflow::MakeUnique; // Builtin Operators. - ops.emplace_back(new Add(::tflite::BuiltinOperator_ADD, OperatorType::kAdd)); - ops.emplace_back(new Div(::tflite::BuiltinOperator_DIV, OperatorType::kDiv)); - ops.emplace_back(new Sub(::tflite::BuiltinOperator_SUB, OperatorType::kSub)); - ops.emplace_back(new AveragePool(::tflite::BuiltinOperator_AVERAGE_POOL_2D, - OperatorType::kAveragePool)); - ops.emplace_back( - new SpaceToBatchND(::tflite::BuiltinOperator_SPACE_TO_BATCH_ND, - OperatorType::kSpaceToBatchND)); - ops.emplace_back( - new BatchToSpaceND(::tflite::BuiltinOperator_BATCH_TO_SPACE_ND, - OperatorType::kBatchToSpaceND)); - ops.emplace_back(new Concatenation(::tflite::BuiltinOperator_CONCATENATION, - OperatorType::kConcatenation)); - ops.emplace_back( - new Convolution(::tflite::BuiltinOperator_CONV_2D, OperatorType::kConv)); - ops.emplace_back( - new DepthwiseConvolution(::tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - OperatorType::kDepthwiseConv)); - ops.emplace_back(new FullyConnected(::tflite::BuiltinOperator_FULLY_CONNECTED, - OperatorType::kFullyConnected)); - ops.emplace_back( - new Gather(::tflite::BuiltinOperator_GATHER, OperatorType::kGather)); - ops.emplace_back( - new L2Normalization(::tflite::BuiltinOperator_L2_NORMALIZATION, - OperatorType::kL2Normalization)); - ops.emplace_back( - new L2Pool(::tflite::BuiltinOperator_L2_POOL_2D, OperatorType::kL2Pool)); - ops.emplace_back(new LocalResponseNormalization( + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_ADD, OperatorType::kAdd)); + ops.push_back( + MakeUnique
(::tflite::BuiltinOperator_DIV, OperatorType::kDiv)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_SUB, OperatorType::kSub)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_AVERAGE_POOL_2D, OperatorType::kAveragePool)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_SPACE_TO_BATCH_ND, + OperatorType::kSpaceToBatchND)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_BATCH_TO_SPACE_ND, + OperatorType::kBatchToSpaceND)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_CONCATENATION, OperatorType::kConcatenation)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_CONV_2D, + OperatorType::kConv)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + OperatorType::kDepthwiseConv)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_FULLY_CONNECTED, + OperatorType::kFullyConnected)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_GATHER, + OperatorType::kGather)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_L2_NORMALIZATION, + OperatorType::kL2Normalization)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_L2_POOL_2D, + OperatorType::kL2Pool)); + ops.push_back(MakeUnique( ::tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, OperatorType::kLocalResponseNormalization)); - ops.emplace_back(new MaxPool(::tflite::BuiltinOperator_MAX_POOL_2D, - OperatorType::kMaxPool)); - ops.emplace_back(new Mul(::tflite::BuiltinOperator_MUL, OperatorType::kMul)); - ops.emplace_back(new Pad(::tflite::BuiltinOperator_PAD, OperatorType::kPad)); - ops.emplace_back( - new PadV2(::tflite::BuiltinOperator_PADV2, OperatorType::kPadV2)); - ops.emplace_back( - new Reshape(::tflite::BuiltinOperator_RESHAPE, OperatorType::kReshape)); - ops.emplace_back( - new Softmax(::tflite::BuiltinOperator_SOFTMAX, OperatorType::kSoftmax)); - ops.emplace_back(new SpaceToDepth(::tflite::BuiltinOperator_SPACE_TO_DEPTH, - OperatorType::kSpaceToDepth)); - ops.emplace_back( - new Svdf(::tflite::BuiltinOperator_SVDF, OperatorType::kSvdf)); - ops.emplace_back(new Transpose(::tflite::BuiltinOperator_TRANSPOSE, - OperatorType::kTranspose)); - ops.emplace_back( - new Mean(::tflite::BuiltinOperator_MEAN, OperatorType::kMean)); - ops.emplace_back(new Sum(::tflite::BuiltinOperator_SUM, OperatorType::kSum)); - ops.emplace_back(new ReduceProd(::tflite::BuiltinOperator_REDUCE_PROD, - OperatorType::kReduceProd)); - ops.emplace_back(new ReduceMax(::tflite::BuiltinOperator_REDUCE_MAX, - OperatorType::kReduceMax)); - ops.emplace_back(new ResizeBilinear(::tflite::BuiltinOperator_RESIZE_BILINEAR, - OperatorType::kResizeBilinear)); - ops.emplace_back( - new Squeeze(::tflite::BuiltinOperator_SQUEEZE, OperatorType::kSqueeze)); - ops.emplace_back( - new Split(::tflite::BuiltinOperator_SPLIT, OperatorType::kSplit)); - ops.emplace_back(new StridedSlice(::tflite::BuiltinOperator_STRIDED_SLICE, - OperatorType::kStridedSlice)); - ops.emplace_back( - new TopK_V2(::tflite::BuiltinOperator_TOPK_V2, OperatorType::kTopK_V2)); - ops.emplace_back( - new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell)); - ops.emplace_back( - new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast)); - ops.emplace_back( - new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax)); - ops.emplace_back( - new ArgMin(::tflite::BuiltinOperator_ARG_MIN, OperatorType::kArgMin)); - ops.emplace_back( - new Tile(::tflite::BuiltinOperator_TILE, OperatorType::kTile)); - ops.emplace_back(new ExpandDims(::tflite::BuiltinOperator_EXPAND_DIMS, - OperatorType::kExpandDims)); - ops.emplace_back(new TransposeConv(::tflite::BuiltinOperator_TRANSPOSE_CONV, - OperatorType::kTransposeConv)); - ops.emplace_back(new SparseToDense(::tflite::BuiltinOperator_SPARSE_TO_DENSE, - OperatorType::kSparseToDense)); - ops.emplace_back( - new Shape(::tflite::BuiltinOperator_SHAPE, OperatorType::kShape)); - ops.emplace_back(new FakeQuant(::tflite::BuiltinOperator_FAKE_QUANT, - OperatorType::kFakeQuant)); - ops.emplace_back( - new Pack(::tflite::BuiltinOperator_PACK, OperatorType::kPack)); - ops.emplace_back( - new OneHot(::tflite::BuiltinOperator_ONE_HOT, OperatorType::kOneHot)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_MAX_POOL_2D, + OperatorType::kMaxPool)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_MUL, OperatorType::kMul)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_PAD, OperatorType::kPad)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_PADV2, OperatorType::kPadV2)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_RESHAPE, + OperatorType::kReshape)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_SOFTMAX, + OperatorType::kSoftmax)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_SPACE_TO_DEPTH, OperatorType::kSpaceToDepth)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_SVDF, OperatorType::kSvdf)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_TRANSPOSE, + OperatorType::kTranspose)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_MEAN, OperatorType::kMean)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_SUM, OperatorType::kSum)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_REDUCE_PROD, + OperatorType::kReduceProd)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_REDUCE_MAX, + OperatorType::kReduceMax)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_RESIZE_BILINEAR, + OperatorType::kResizeBilinear)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_SQUEEZE, + OperatorType::kSqueeze)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_SPLIT, OperatorType::kSplit)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_STRIDED_SLICE, OperatorType::kStridedSlice)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_TOPK_V2, + OperatorType::kTopK_V2)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_LSTM, + OperatorType::kLstmCell)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_CAST, OperatorType::kCast)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_ARG_MAX, + OperatorType::kArgMax)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_ARG_MIN, + OperatorType::kArgMin)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_TILE, OperatorType::kTile)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_EXPAND_DIMS, + OperatorType::kExpandDims)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_TRANSPOSE_CONV, OperatorType::kTransposeConv)); + ops.push_back(MakeUnique( + ::tflite::BuiltinOperator_SPARSE_TO_DENSE, OperatorType::kSparseToDense)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_SHAPE, OperatorType::kShape)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_FAKE_QUANT, + OperatorType::kFakeQuant)); + ops.push_back( + MakeUnique(::tflite::BuiltinOperator_PACK, OperatorType::kPack)); + ops.push_back(MakeUnique(::tflite::BuiltinOperator_ONE_HOT, + OperatorType::kOneHot)); // Custom Operators. - ops.emplace_back( - new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace)); - ops.emplace_back(new TensorFlowUnsupported("TENSORFLOW_UNSUPPORTED", - OperatorType::kUnsupported)); + ops.push_back( + MakeUnique("DEPTH_TO_SPACE", OperatorType::kDepthToSpace)); + ops.push_back(MakeUnique( + "CTC_BEAM_SEARCH_DECODER", OperatorType::kCTCBeamSearchDecoder)); + ops.push_back(MakeUnique("TENSORFLOW_UNSUPPORTED", + OperatorType::kUnsupported)); // There operators are supported by Toco, but not by TF Lite, and has no // attributes. - ops.emplace_back( - new SimpleOperator("ADDN", OperatorType::kAddN)); + ops.push_back( + MakeUnique>("ADDN", OperatorType::kAddN)); // Simple Operators. - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "DEQUANTIZE", OperatorType::kDequantize)); - ops.emplace_back( - new SimpleOperator("FLOOR", OperatorType::kFloor)); - ops.emplace_back( - new SimpleOperator("RELU", OperatorType::kRelu)); - ops.emplace_back( - new SimpleOperator("RELU_N1_TO_1", OperatorType::kRelu1)); - ops.emplace_back( - new SimpleOperator("RELU6", OperatorType::kRelu6)); - ops.emplace_back( - new SimpleOperator("PRELU", OperatorType::kPRelu)); - ops.emplace_back(new SimpleOperator( + ops.push_back( + MakeUnique>("FLOOR", OperatorType::kFloor)); + ops.push_back( + MakeUnique>("RELU", OperatorType::kRelu)); + ops.push_back(MakeUnique>( + "RELU_N1_TO_1", OperatorType::kRelu1)); + ops.push_back( + MakeUnique>("RELU6", OperatorType::kRelu6)); + ops.push_back( + MakeUnique>("PRELU", OperatorType::kPRelu)); + ops.push_back(MakeUnique>( "LOGISTIC", OperatorType::kLogistic)); - ops.emplace_back( - new SimpleOperator("TANH", OperatorType::kTanh)); - ops.emplace_back(new SimpleOperator("EXP", OperatorType::kExp)); - ops.emplace_back(new SimpleOperator( + ops.push_back( + MakeUnique>("TANH", OperatorType::kTanh)); + ops.push_back( + MakeUnique>("EXP", OperatorType::kExp)); + ops.push_back(MakeUnique>( "LOG_SOFTMAX", OperatorType::kLogSoftmax)); - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "MAXIMUM", OperatorType::kMaximum)); // Element-wise Maximum - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "MINIMUM", OperatorType::kMinimum)); // Element-wise Minimum - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "GREATER", OperatorType::kGreater)); - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "GREATER_EQUAL", OperatorType::kGreaterEqual)); - ops.emplace_back( - new SimpleOperator("LESS", OperatorType::kLess)); - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( + "LESS", OperatorType::kLess)); + ops.push_back(MakeUnique>( "LESS_EQUAL", OperatorType::kLessEqual)); - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "EQUAL", OperatorType::kEqual)); - ops.emplace_back(new SimpleOperator( + ops.push_back(MakeUnique>( "NOT_EQUAL", OperatorType::kNotEqual)); - ops.emplace_back(new SimpleOperator("NEG", OperatorType::kNeg)); - ops.emplace_back( - new SimpleOperator("SELECT", OperatorType::kSelect)); - ops.emplace_back( - new SimpleOperator("SLICE", OperatorType::kSlice)); - ops.emplace_back(new SimpleOperator("POW", OperatorType::kPow)); - ops.emplace_back(new SimpleOperator( + ops.push_back( + MakeUnique>("NEG", OperatorType::kNeg)); + ops.push_back(MakeUnique>( + "SELECT", OperatorType::kSelect)); + ops.push_back( + MakeUnique>("SLICE", OperatorType::kSlice)); + ops.push_back( + MakeUnique>("POW", OperatorType::kPow)); + ops.push_back(MakeUnique>( "LOGICAL_OR", OperatorType::kLogicalOr)); + ops.emplace_back(new SimpleOperator( + "LOGICAL_AND", OperatorType::kLogicalAnd)); + ops.emplace_back(new SimpleOperator( + "LOGICAL_NOT", OperatorType::kLogicalNot)); // Element-wise operator - ops.emplace_back(new SimpleOperator("SIN", OperatorType::kSin)); - ops.emplace_back(new SimpleOperator("LOG", OperatorType::kLog)); - ops.emplace_back( - new SimpleOperator("SQRT", OperatorType::kSqrt)); - ops.emplace_back(new SimpleOperator( + ops.push_back( + MakeUnique>("SIN", OperatorType::kSin)); + ops.push_back( + MakeUnique>("LOG", OperatorType::kLog)); + ops.push_back(MakeUnique>( + "SQRT", OperatorType::kSqrt)); + ops.push_back(MakeUnique>( "RSQRT", OperatorType::kRsqrt)); return ops; diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc index 384f7c118de82d9907e3791f880a5beee2a1a77a..fc854461b4e816e12e12590479501b6542258fef 100644 --- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc @@ -129,6 +129,10 @@ TEST_F(OperatorTest, SimpleOperators) { CheckSimpleOperator("POW", OperatorType::kPow); CheckSimpleOperator("LOGICAL_OR", OperatorType::kLogicalOr); + CheckSimpleOperator("LOGICAL_AND", + OperatorType::kLogicalAnd); + CheckSimpleOperator("LOGICAL_NOT", + OperatorType::kLogicalNot); } TEST_F(OperatorTest, BuiltinAdd) { @@ -472,6 +476,20 @@ TEST_F(OperatorTest, BuiltinOneHot) { EXPECT_EQ(op.axis, output_toco_op->axis); } +TEST_F(OperatorTest, CustomCTCBeamSearchDecoder) { + CTCBeamSearchDecoderOperator op; + op.beam_width = 3; + op.top_paths = 2; + op.merge_repeated = false; + std::unique_ptr output_toco_op = + SerializeAndDeserialize(GetOperator("CTC_BEAM_SEARCH_DECODER", + OperatorType::kCTCBeamSearchDecoder), + op); + EXPECT_EQ(op.beam_width, output_toco_op->beam_width); + EXPECT_EQ(op.top_paths, output_toco_op->top_paths); + EXPECT_EQ(op.merge_repeated, output_toco_op->merge_repeated); +} + TEST_F(OperatorTest, TensorFlowUnsupported) { TensorFlowUnsupportedOperator op; op.tensorflow_op = "MyCustomUnsupportedOp"; diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc index de76fd4032d24eff8a6c2fd0c16a911b9c00186b..14168fa33f77a75706a52f00ddfa6b1120d90626 100644 --- a/tensorflow/contrib/lite/toco/toco_port.cc +++ b/tensorflow/contrib/lite/toco/toco_port.cc @@ -38,7 +38,8 @@ void CopyToBuffer(const Cord& src, char* dest) { src.CopyToArray(dest); } } // namespace port } // namespace toco -#if defined(PLATFORM_GOOGLE) && !defined(__APPLE__) && !defined(__ANDROID__) +#if defined(PLATFORM_GOOGLE) && !defined(__APPLE__) && \ + !defined(__ANDROID__) && !defined(_WIN32) // Wrap Google file operations. @@ -115,9 +116,12 @@ string JoinPath(const string& a, const string& b) { } // namespace port } // namespace toco -#else // (__APPLE__ || __ANDROID__) +#else // !PLATFORM_GOOGLE || __APPLE__ || __ANDROID__ || _WIN32 #include +#if defined(_WIN32) +#include // for _close, _open, _read +#endif #include #include #include @@ -130,6 +134,19 @@ string JoinPath(const string& a, const string& b) { namespace toco { namespace port { +#if defined(_WIN32) +#define close _close +#define open _open +#define read _read +#define O_RDONLY _O_RDONLY +#define O_CREAT _O_CREAT +#define O_WRONLY _O_WRONLY +// Windows does not support the same set of file permissions as other platforms. +constexpr int kFileCreateMode = _S_IREAD | _S_IWRITE; +#else +constexpr int kFileCreateMode = 0664; +#endif // _WIN32 + static bool port_initialized = false; void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) { @@ -209,7 +226,7 @@ tensorflow::Status GetContents(const string& path, string* output, tensorflow::Status SetContents(const string& filename, const string& contents, const file::Options& options) { - int fd = open(filename.c_str(), O_WRONLY | O_CREAT, 0664); + int fd = open(filename.c_str(), O_WRONLY | O_CREAT, kFileCreateMode); if (fd == -1) { return tensorflow::errors::Internal("can't open() for write"); } @@ -243,4 +260,4 @@ string JoinPath(const string& base, const string& filename) { } // namespace port } // namespace toco -#endif // (__APPLE || __ANDROID__) +#endif // !PLATFORM_GOOGLE || __APPLE || __ANDROID__ || _WIN32 diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc index fcd3cbab07c06737f43d822e5b16f7c188f56b1a..34130a02b03d0104df6f2a16ebccc50202f34f46 100644 --- a/tensorflow/contrib/lite/toco/toco_tooling.cc +++ b/tensorflow/contrib/lite/toco/toco_tooling.cc @@ -90,8 +90,10 @@ void MakeGeneralGraphTransformationsSet( transformations->Add(new ResolveConstantRandomUniform); transformations->Add(new ResolveConstantRange); transformations->Add(new ResolveConstantReshape); + transformations->Add(new ResolveConstantSelect); transformations->Add(new ResolveConstantSlice); transformations->Add(new ResolveConstantStridedSlice); + transformations->Add(new ResolveConstantTile); transformations->Add(new ResolveConstantTranspose); transformations->Add(new ResolveConstantUnaryOperator); transformations->Add(new ResolveTensorFlowMerge); diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc index 68155c73294d1cdd1a258aac98da9cd81fa4bbca..2ad27198119b4a8150a7381c047a4edb51aebfe6 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.cc +++ b/tensorflow/contrib/lite/toco/tooling_util.cc @@ -404,6 +404,7 @@ const char* OperatorTypeName(OperatorType type) { HANDLE_OPERATORTYPENAME_CASE(LogicalAnd) HANDLE_OPERATORTYPENAME_CASE(LogicalNot) HANDLE_OPERATORTYPENAME_CASE(LogicalOr) + HANDLE_OPERATORTYPENAME_CASE(CTCBeamSearchDecoder) default: LOG(FATAL) << "Unhandled op type"; #undef HANDLE_OPERATORTYPENAME_CASE @@ -601,14 +602,33 @@ void UnextendShape(Shape* shape, int new_shape_size) { shape_dims.erase(shape_dims.begin(), shape_dims.begin() + size_reduction); } -bool IsValid(const Shape& shape) { +// In general, zero-sized dimensions are disallowed, but there are exceptions, +// e.g., if the tensor data itself represents a scalar (rank 0) shape, its +// shape will have dimensions [0]. CheckNonEmptyShapeDimensions is more +// strict, and is appropriate for ops and comparisons where an empty shape +// doesn't make sense. +template +void CheckValidShapeDimensions(const Dims& dims) { + if (dims.size() == 1 && dims[0] == 0) { + return; + } + for (const auto& dim : dims) { + CHECK_GE(dim, 1); + } +} + +void CheckValidShape(const Shape& shape) { + CheckValidShapeDimensions(shape.dims()); +} + +bool IsNonEmpty(const Shape& shape) { for (int i = 0; i < shape.dimensions_count(); ++i) { if (shape.dims(i) < 1) return false; } return true; } -void CheckShapeDimensions(const Shape& shape) { +void CheckNonEmptyShapeDimensions(const Shape& shape) { for (int i = 0; i < shape.dimensions_count(); ++i) { CHECK_GE(shape.dims()[i], 1) << "shape has dimension 0 at index << " << i << ". shape = " << ShapeToString(shape); @@ -616,8 +636,8 @@ void CheckShapeDimensions(const Shape& shape) { } bool ShapesAgreeUpToBroadcasting(const Shape& shape0, const Shape& shape1) { - CheckShapeDimensions(shape0); - CheckShapeDimensions(shape1); + CheckNonEmptyShapeDimensions(shape0); + CheckNonEmptyShapeDimensions(shape1); const Shape* longer = &shape0; const Shape* shorter = &shape1; @@ -644,8 +664,8 @@ bool ShapesAgreeUpToBroadcasting(const Shape& shape0, const Shape& shape1) { } bool ShapesAgreeUpToExtending(const Shape& shape0, const Shape& shape1) { - CheckShapeDimensions(shape0); - CheckShapeDimensions(shape1); + CheckNonEmptyShapeDimensions(shape0); + CheckNonEmptyShapeDimensions(shape1); const Shape* longer = &shape0; const Shape* shorter = &shape1; @@ -682,9 +702,9 @@ bool ShapesAgreeUpToExtending(const Shape& shape0, const Shape& shape1) { } int RequiredBufferSizeForShape(const Shape& shape) { + CheckValidShape(shape); int max_offset = 1; for (const auto& dim : shape.dims()) { - CHECK_GE(dim, 1); max_offset *= dim; } return max_offset; @@ -945,13 +965,7 @@ void CheckEachArray(const Model& model) { // shape. CHECK(array->has_shape()); // Constant buffer should has a valid shape. - bool is_scalar = - array->shape().dimensions_count() == 1 && array->shape().dims(0) == 0; - if (!is_scalar) { - for (int d : array->shape().dims()) { - CHECK_GE(d, 1); - } - } + CheckValidShape(array->shape()); // The shape flat-size should agree with the buffer length. CHECK_EQ(array->buffer->Length(), RequiredBufferSizeForShape(array->shape())); @@ -1543,8 +1557,8 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) { if (!input_array.has_shape()) { if (input_array_proto.has_shape()) { auto& input_array_dims = *input_array.mutable_shape()->mutable_dims(); + CheckValidShapeDimensions(input_array_proto.shape().dims()); for (auto dim : input_array_proto.shape().dims()) { - CHECK_GE(dim, 1); input_array_dims.push_back(dim); } } diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h index 5dbfa54fa0369676dce638aec171b409a468da9f..b99e6111fe92be178b5ff8b83477f1ce10c20926 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.h +++ b/tensorflow/contrib/lite/toco/tooling_util.h @@ -115,10 +115,9 @@ void ExtendShape(Shape* shape, int new_shape_size); // TODO(b/36075966): Clean up when dims superseded by array shape. void UnextendShape(Shape* shape, int new_shape_size); -// Checks that all dimensions of 'shape' are at least 1. -bool IsValid(const Shape& shape); -// Same as above, but reports error using CHECK. -void CheckShapeDimensions(const Shape& shape); +// Checks that all dimensions of 'shape' are at least 1. Note that scalars, +// lacking dimensions, satisfy this condition and are considered non-empty. +bool IsNonEmpty(const Shape& shape); // Given two shapes with potentially different dimensionality and dimension // arrays d0 and d1. Without loss of generality, assume that shape0 may have diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/tools/make/Makefile similarity index 67% rename from tensorflow/contrib/lite/Makefile rename to tensorflow/contrib/lite/tools/make/Makefile index df5954744a41191d922e91553303e052969c24fb..e30cc1d70e1370f6243d9dcd39eeaa8f20cc4b1a 100644 --- a/tensorflow/contrib/lite/Makefile +++ b/tensorflow/contrib/lite/tools/make/Makefile @@ -6,119 +6,74 @@ endif # Try to figure out the host system HOST_OS := ifeq ($(OS),Windows_NT) - HOST_OS = WINDOWS + HOST_OS = windows else UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) - HOST_OS := LINUX + HOST_OS := linux endif ifeq ($(UNAME_S),Darwin) - HOST_OS := OSX + HOST_OS := osx endif endif HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi) -# Self-hosting -TARGET_ARCH := ${HOST_ARCH} +# Override these on the make command line to target a specific architecture. For example: +# make -f tensorflow/contrib/lite/Makefile TARGET=rpi TARGET_ARCH=armv7l +TARGET := $(HOST_OS) +TARGET_ARCH := $(HOST_ARCH) -# Cross compiling -ifeq ($(CROSS),rpi) - TARGET_ARCH := armv7l - TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf- -endif - -ifeq ($(CROSS),riscv) - TARGET_ARCH := riscv - TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- -endif -ifeq ($(CROSS),stm32f7) - TARGET_ARCH := armf7 - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- -endif -ifeq ($(CROSS),stm32f1) - TARGET_ARCH := armm1 - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- -endif - -# Where compiled objects are stored. -OBJDIR := $(MAKEFILE_DIR)/gen/obj/ -BINDIR := $(MAKEFILE_DIR)/gen/bin/ -LIBDIR := $(MAKEFILE_DIR)/gen/lib/ -GENDIR := $(MAKEFILE_DIR)/gen/obj/ - -LIBS := -ifeq ($(TARGET_ARCH),x86_64) - CXXFLAGS += -fPIC -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -pthread # -msse4.2 -endif - -ifeq ($(TARGET_ARCH),armv7l) - CXXFLAGS += -mfpu=neon -pthread -fPIC - LIBS += -ldl -endif - -ifeq ($(TARGET_ARCH),riscv) -# CXXFLAGS += -march=gap8 - CXXFLAGS += -DTFLITE_MCU - LIBS += -ldl - BUILD_TYPE := micro -endif - -ifeq ($(TARGET_ARCH),armf7) - CXXFLAGS += -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_MCU - CXXFLAGS += -fno-rtti -fmessage-length=0 -fno-exceptions -fno-builtin -ffunction-sections -fdata-sections - CXXFLAGS += -funsigned-char -MMD - CXXFLAGS += -mcpu=cortex-m7 -mthumb -mfpu=fpv5-sp-d16 -mfloat-abi=softfp - CXXFLAGS += '-std=gnu++11' '-fno-rtti' '-Wvla' '-c' '-Wall' '-Wextra' '-Wno-unused-parameter' '-Wno-missing-field-initializers' '-fmessage-length=0' '-fno-exceptions' '-fno-builtin' '-ffunction-sections' '-fdata-sections' '-funsigned-char' '-MMD' '-fno-delete-null-pointer-checks' '-fomit-frame-pointer' '-Os' - LIBS += -ldl - BUILD_TYPE := micro -endif -ifeq ($(TARGET_ARCH),armm1) - CXXFLAGS += -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -mcpu=cortex-m1 -mthumb -DTFLITE_MCU - CXXFLAGS += -fno-rtti -fmessage-length=0 -fno-exceptions -fno-builtin -ffunction-sections -fdata-sections - CXXFLAGS += -funsigned-char -MMD - LIBS += -ldl -endif +# These are the default libraries needed, but they can be added to or +# overridden by the platform-specific settings in target makefiles. +LIBS := \ +-lstdc++ \ +-lpthread \ +-lm \ +-lz -# Settings for the host compiler. -CXX := $(CC_PREFIX) ${TARGET_TOOLCHAIN_PREFIX}g++ -CXXFLAGS += -O3 -DNDEBUG +# There are no rules for compiling objects for the host system (since we don't +# generate things like the protobuf compiler that require that), so all of +# these settings are for the target compiler. +CXXFLAGS := -O3 -DNDEBUG CCFLAGS := ${CXXFLAGS} CXXFLAGS += --std=c++11 -CC := $(CC_PREFIX) ${TARGET_TOOLCHAIN_PREFIX}gcc -AR := $(CC_PREFIX) ${TARGET_TOOLCHAIN_PREFIX}ar CFLAGS := -LDOPTS := -LDOPTS += -L/usr/local/lib +LDOPTS := -L/usr/local/lib ARFLAGS := -r +TARGET_TOOLCHAIN_PREFIX := +CC_PREFIX := + +# These target-specific makefiles should modify or replace options like +# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic +# based on platforms or architectures should happen within these files, to +# keep this main makefile focused on the sources and dependencies. +include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc) + +# Where compiled objects are stored. +GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/ +OBJDIR := $(GENDIR)obj/ +BINDIR := $(GENDIR)bin/ +LIBDIR := $(GENDIR)lib/ INCLUDES := \ -I. \ --I$(MAKEFILE_DIR)/../../../ \ +-I$(MAKEFILE_DIR)/../../../../../ \ +-I$(MAKEFILE_DIR)/../../../../../../ \ -I$(MAKEFILE_DIR)/downloads/ \ -I$(MAKEFILE_DIR)/downloads/eigen \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/neon_2_sse \ -I$(MAKEFILE_DIR)/downloads/farmhash/src \ -I$(MAKEFILE_DIR)/downloads/flatbuffers/include \ --I$(GENDIR) +-I$(OBJDIR) # This is at the end so any globally-installed frameworks like protobuf don't # override local versions in the source tree. INCLUDES += -I/usr/local/include -LIBS += \ --lstdc++ \ --lpthread \ --lm \ --lz - -# If we're on Linux, also link in the dl library. -ifeq ($(HOST_OS),LINUX) - LIBS += -ldl -endif - -include $(MAKEFILE_DIR)/ios_makefile.inc -include $(MAKEFILE_DIR)/rpi_makefile.inc +CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++ +CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc +AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar # This library is the main target for this makefile. It will contain a minimal # runtime that can be linked in to other programs. @@ -162,8 +117,8 @@ $(wildcard tensorflow/contrib/lite/kernels/*.c) \ $(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \ $(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \ $(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \ -$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc) \ -$(wildcard tensorflow/contrib/lite/downloads/fft2d/fftsg.c) +$(wildcard tensorflow/contrib/lite/tools/make/downloads/farmhash/src/farmhash.cc) \ +$(wildcard tensorflow/contrib/lite/tools/make/downloads/fft2d/fftsg.c) endif # Remove any duplicates. CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) @@ -176,7 +131,7 @@ $(wildcard tensorflow/contrib/lite/kernels/test_util.cc) \ $(MINIMAL_SRCS) ifeq ($(BUILD_TYPE),micro) CORE_CC_EXCLUDE_SRCS += \ -tensorflow/contrib/lite/model.cc \ +tensorflow/contrib/lite/mmap_allocation.cc \ tensorflow/contrib/lite/nnapi_delegate.cc endif # Filter out all the excluded files. @@ -214,8 +169,12 @@ all: $(LIB_PATH) $(MINIMAL_PATH) $(BENCHMARK_BINARY) # The target that's compiled for micro-controllers micro: $(LIB_PATH) +# Hack for generating schema file bypassing flatbuffer parsing +tensorflow/contrib/lite/schema/schema_generated.h: + @cp -u tensorflow/contrib/lite/schema/schema_generated.h.OPENSOURCE tensorflow/contrib/lite/schema/schema_generated.h + # Gathers together all the objects we've compiled into a single '.a' archive. -$(LIB_PATH): $(LIB_OBJS) +$(LIB_PATH): tensorflow/contrib/lite/schema/schema_generated.h $(LIB_OBJS) @mkdir -p $(dir $@) $(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS) diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh similarity index 66% rename from tensorflow/contrib/lite/build_ios_universal_lib.sh rename to tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh index 31df43a1754bd753a82a613dc15704aaa056a87e..fe056945a652b04d078947f58bfe6ab60aa1f387 100755 --- a/tensorflow/contrib/lite/build_ios_universal_lib.sh +++ b/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh @@ -17,23 +17,23 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR/../../.." +cd "$SCRIPT_DIR/../../../../.." # Build library for supported architectures and packs them in a fat binary. make_library() { for arch in x86_64 armv7 armv7s arm64 do - make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=${arch} \ - -j 8 \ - $SCRIPT_DIR/gen/lib/ios_${arch}/${1} + make -f tensorflow/contrib/lite/tools/make/Makefile TARGET=ios TARGET_ARCH=${arch} \ + -j 8 done + mkdir -p tensorflow/contrib/lite/tools/make/gen/lib lipo \ - tensorflow/contrib/lite/gen/lib/ios_x86_64/${1} \ - tensorflow/contrib/lite/gen/lib/ios_armv7/${1} \ - tensorflow/contrib/lite/gen/lib/ios_armv7s/${1} \ - tensorflow/contrib/lite/gen/lib/ios_arm64/${1} \ + tensorflow/contrib/lite/tools/make/gen/ios_x86_64/lib/${1} \ + tensorflow/contrib/lite/tools/make/gen/ios_armv7/lib/${1} \ + tensorflow/contrib/lite/tools/make/gen/ios_armv7s/lib/${1} \ + tensorflow/contrib/lite/tools/make/gen/ios_arm64/lib/${1} \ -create \ - -output tensorflow/contrib/lite/gen/lib/${1} + -output tensorflow/contrib/lite/tools/make/gen/lib/${1} } make_library libtensorflow-lite.a diff --git a/tensorflow/contrib/lite/build_rpi_lib.sh b/tensorflow/contrib/lite/tools/make/build_rpi_lib.sh similarity index 90% rename from tensorflow/contrib/lite/build_rpi_lib.sh rename to tensorflow/contrib/lite/tools/make/build_rpi_lib.sh index 3824b16412ed26a6cab79df3242da6017c3322b0..24ecd4356df12c25dbdbf81684b7de128e8d11f4 100755 --- a/tensorflow/contrib/lite/build_rpi_lib.sh +++ b/tensorflow/contrib/lite/tools/make/build_rpi_lib.sh @@ -17,6 +17,6 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR/../../.." +cd "$SCRIPT_DIR/../../../../.." -CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/Makefile TARGET=RPI TARGET_ARCH=armv7 +CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv7l diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/tools/make/download_dependencies.sh similarity index 98% rename from tensorflow/contrib/lite/download_dependencies.sh rename to tensorflow/contrib/lite/tools/make/download_dependencies.sh index 8c7df474d55a85d7a6659b436e33ebf7632ab960..29afa45133775224cef5c2bdd59cc513b0a47914 100755 --- a/tensorflow/contrib/lite/download_dependencies.sh +++ b/tensorflow/contrib/lite/tools/make/download_dependencies.sh @@ -17,9 +17,9 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR/../../.." +cd "$SCRIPT_DIR/../../../../.." -DOWNLOADS_DIR=tensorflow/contrib/lite/downloads +DOWNLOADS_DIR=tensorflow/contrib/lite/tools/make/downloads BZL_FILE_PATH=tensorflow/workspace.bzl # Ensure it is being run from repo root diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc similarity index 67% rename from tensorflow/contrib/lite/ios_makefile.inc rename to tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc index 079320586ffd01fc77818a81e0c5962f1d28c1f1..7f36b8ecef4715a4b89e74bd9ef17d28bbf72ae2 100644 --- a/tensorflow/contrib/lite/ios_makefile.inc +++ b/tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc @@ -1,11 +1,11 @@ # Settings for iOS. -ifeq ($(TARGET), IOS) - BUILD_FOR_IOS_SIMULATOR := false - ifeq ($(IOS_ARCH), x86_64) - BUILD_FOR_IOS_SIMULATOR := true +ifeq ($(TARGET), ios) + BUILD_FOR_IOS_SIMULATOR := false + ifeq ($(TARGET_ARCH), x86_64) + BUILD_FOR_IOS_SIMULATOR := true endif - ifeq ($(IOS_ARCH), i386) - BUILD_FOR_IOS_SIMULATOR := true + ifeq ($(TARGET_ARCH), i386) + BUILD_FOR_IOS_SIMULATOR := true endif ifeq ($(BUILD_FOR_IOS_SIMULATOR), true) IPHONEOS_PLATFORM := $(shell xcrun --sdk iphonesimulator \ @@ -18,8 +18,8 @@ ifeq ($(TARGET), IOS) endif IOS_SDK_VERSION := $(shell xcrun --sdk iphoneos --show-sdk-version) MIN_SDK_VERSION := 9.0 - # Override IOS_ARCH with armv7, armv7s, arm64, i386, or x86_64. - IOS_ARCH := x86_64 + # Override TARGET_ARCH with armv7, armv7s, arm64, i386, or x86_64. + TARGET_ARCH := x86_64 CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \ -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ -DTFLITE_USE_APPLE_ACCELERATE_FOR_CONV \ @@ -29,21 +29,17 @@ ifeq ($(TARGET), IOS) -fno-exceptions \ -isysroot \ ${IPHONEOS_SYSROOT} \ - -arch $(IOS_ARCH) \ + -arch $(TARGET_ARCH) \ -O3 CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \ -fembed-bitcode \ -mno-thumb \ -isysroot \ ${IPHONEOS_SYSROOT} \ - -arch $(IOS_ARCH) \ + -arch $(TARGET_ARCH) \ -O3 LDFLAGS := -fembed-bitcode \ -miphoneos-version-min=${MIN_SDK_VERSION} \ -framework Accelerate \ - -arch $(IOS_ARCH) - OBJDIR := $(OBJDIR)ios_$(IOS_ARCH)/ - LIBDIR := $(LIBDIR)ios_$(IOS_ARCH)/ - BINDIR := $(BINDIR)ios_$(IOS_ARCH)/ - DEPDIR := $(DEPDIR)ios_$(IOS_ARCH)/ + -arch $(TARGET_ARCH) endif diff --git a/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc new file mode 100644 index 0000000000000000000000000000000000000000..86499da99e25c4d025707bc71ebf47d821b3a924 --- /dev/null +++ b/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc @@ -0,0 +1,10 @@ +# Settings for Linux. +ifeq ($(TARGET), linux) + CXXFLAGS += \ + -fPIC \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -pthread + # TODO(petewarden): In the future we may want to add architecture-specific + # flags like -msse4.2 + LIBS += -ldl +endif diff --git a/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc new file mode 100644 index 0000000000000000000000000000000000000000..1a82afec33e092090ebb90c1fe18c5adf881f959 --- /dev/null +++ b/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc @@ -0,0 +1,10 @@ +# Settings for RiscV platforms. +ifeq ($(TARGET), riscv) + TARGET_ARCH := riscv + TARGET_TOOLCHAIN_PREFIX := riscv32-unknown-elf- + + #CXXFLAGS += -march=gap8 + CXXFLAGS += -DTFLITE_MCU + LIBS += -ldl + BUILD_TYPE := micro +endif diff --git a/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc new file mode 100644 index 0000000000000000000000000000000000000000..1ad0c502372e32a5f5d01cde6c8d775189406777 --- /dev/null +++ b/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc @@ -0,0 +1,60 @@ +# Settings for Raspberry Pi. +ifeq ($(TARGET),rpi) + # Default to the architecture used on the Pi Two/Three (ArmV7), but override this + # with TARGET_ARCH=armv6 to build for the Pi Zero or One. + TARGET_ARCH := armv7l + TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf- + + ifeq ($(TARGET_ARCH), armv7l) + CXXFLAGS += \ + -march=armv7-a \ + -mfpu=neon-vfpv4 \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + CCFLAGS += \ + -march=armv7-a \ + -mfpu=neon-vfpv4 \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + LDFLAGS := \ + -Wl,--no-export-dynamic \ + -Wl,--exclude-libs,ALL \ + -Wl,--gc-sections \ + -Wl,--as-needed + endif + + # TODO(petewarden) In the future, we'll want to use OpenBLAS as a faster + # alternative to Eigen on non-NEON ARM hardware like armv6. + ifeq ($(TARGET_ARCH), armv6) + CXXFLAGS += \ + -march=armv6 \ + -mfpu=vfp \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + CCFLAGS += \ + -march=armv6 \ + -mfpu=vfp \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + LDFLAGS := \ + -Wl,--no-export-dynamic \ + -Wl,--exclude-libs,ALL \ + -Wl,--gc-sections \ + -Wl,--as-needed + endif + + LIBS := \ + -lstdc++ \ + -lpthread \ + -lm \ + -ldl + +endif diff --git a/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc new file mode 100644 index 0000000000000000000000000000000000000000..7418e4d196ed1384bc16baa2c0289173060f74ac --- /dev/null +++ b/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc @@ -0,0 +1,21 @@ +# Settings for STM32F1 platforms. +ifeq ($(TARGET), stm32f1) + TARGET_ARCH := armm1 + TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- + + CXXFLAGS += \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -mcpu=cortex-m1 \ + -mthumb \ + -DTFLITE_MCU \ + -fno-rtti \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-builtin \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD + LIBS += -ldl + BUILD_TYPE := micro +endif diff --git a/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc b/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc new file mode 100644 index 0000000000000000000000000000000000000000..48af71e5b4ba34897bd20d42b6a01ae1198a83ef --- /dev/null +++ b/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc @@ -0,0 +1,41 @@ +# Settings for STM32F7 platforms. +ifeq ($(TARGET), stm32f7) + TARGET_ARCH := armf7 + TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- + + CXXFLAGS += \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -DTFLITE_MCU \ + -fno-rtti \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-builtin \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD \ + -mcpu=cortex-m7 \ + -mthumb \ + -mfpu=fpv5-sp-d16 \ + -mfloat-abi=softfp \ + -std=gnu++11 \ + -fno-rtti \ + -Wvla \ + -c \ + -Wall \ + -Wextra \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-builtin \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD \ + -fno-delete-null-pointer-checks \ + -fomit-frame-pointer \ + -Os + LIBS += -ldl + BUILD_TYPE := micro +endif diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/contrib/lite/tools/visualize.py index e07f899e4d8c249cb03d4251a722df0614007fed..597dede63b0c089da21f4b0ede065189d8bbe1d8 100644 --- a/tensorflow/contrib/lite/tools/visualize.py +++ b/tensorflow/contrib/lite/tools/visualize.py @@ -334,7 +334,7 @@ def CreateHtmlFile(tflite_input, html_output): for key, mapping in toplevel_stuff: if not mapping: mapping = lambda x: x - html += "
\n" % (key, mapping(data[key])) + html += "\n" % (key, mapping(data.get(key))) html += "
- Mobilenet_1.0_224(float) + Mobilenet_1.0_224(float) Pixel 2 166.5 ms (2.6 ms)
- Mobilenet_1.0_224 (quant) + Mobilenet_1.0_224 (quant) Pixel 2 69.5 ms (0.9 ms)
- Mobilenet_1.0_224(float) + Mobilenet_1.0_224(float) iPhone 8 32.2 ms (0.8 ms)
- Mobilenet_1.0_224 (quant) + Mobilenet_1.0_224 (quant) iPhone 8 24.4 ms (0.8 ms)
%s%s
%s%s
\n" # Spec on what keys to display diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py index 4942d941765951ed2ee5555138e91a202b96bf7c..8c0bfefb30319456e378a85c717c28910811159b 100644 --- a/tensorflow/contrib/lookup/lookup_ops.py +++ b/tensorflow/contrib/lookup/lookup_ops.py @@ -20,7 +20,6 @@ from __future__ import print_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import gen_lookup_ops from tensorflow.python.ops import lookup_ops # pylint: disable=unused-import @@ -395,17 +394,12 @@ class MutableHashTable(LookupInterface): Raises: TypeError: when `keys` do not match the table data types. """ - if keys.dtype.base_dtype != self._key_dtype: - raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." % - (self._key_dtype, keys.dtype)) - with ops.name_scope(name, "%s_lookup_table_find" % self._name, (self._table_ref, keys, self._default_value)) as name: + keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys") with ops.colocate_with(self._table_ref): values = gen_lookup_ops.lookup_table_find_v2( self._table_ref, keys, self._default_value, name=name) - - values.set_shape(keys.get_shape().concatenate(self._value_shape)) return values def insert(self, keys, values, name=None): @@ -451,9 +445,6 @@ class MutableHashTable(LookupInterface): with ops.colocate_with(self._table_ref): exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2( self._table_ref, self._key_dtype, self._value_dtype, name=name) - - exported_values.set_shape(exported_keys.get_shape().concatenate( - self._value_shape)) return exported_keys, exported_values class _Saveable(BaseSaverBuilder.SaveableObject): @@ -537,14 +528,15 @@ class MutableDenseHashTable(LookupInterface): ValueError: If checkpoint is True and no name was specified. """ self._default_value = ops.convert_to_tensor( - default_value, dtype=value_dtype) + default_value, dtype=value_dtype, name="default_value") self._value_shape = self._default_value.get_shape() # The table must be shared if checkpointing is requested for multi-worker # training to work correctly. Use the node name if no shared_name has been # explicitly specified. use_node_name_sharing = checkpoint and shared_name is None - empty_key = ops.convert_to_tensor(empty_key, dtype=key_dtype) + empty_key = ops.convert_to_tensor( + empty_key, dtype=key_dtype, name="empty_key") self._table_ref = gen_lookup_ops.mutable_dense_hash_table_v2( empty_key=empty_key, shared_name=shared_name, @@ -591,20 +583,13 @@ class MutableDenseHashTable(LookupInterface): Raises: TypeError: when `keys` do not match the table data types. """ - if keys.dtype.base_dtype != self._key_dtype: - raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." % - (self._key_dtype, keys.dtype)) - with ops.name_scope(name, "%s_lookup_table_find" % self._name, [self._table_ref, keys]) as name: + keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys") with ops.colocate_with(self._table_ref): values = gen_lookup_ops.lookup_table_find_v2( self._table_ref, keys, self._default_value, name=name) - if keys.get_shape().ndims is not None and keys.get_shape().ndims > 0: - values.set_shape( - tensor_shape.TensorShape([keys.get_shape().dims[0]]).concatenate( - self._value_shape)) return values def insert(self, keys, values, name=None): @@ -624,11 +609,11 @@ class MutableDenseHashTable(LookupInterface): TypeError: when `keys` or `values` doesn't match the table data types. """ - # pylint: disable=protected-access - lookup_ops._check_table_dtypes(self, keys.dtype, values.dtype) - # pylint: enable=protected-access with ops.name_scope(name, "%s_lookup_table_insert" % self._name, [self._table_ref, keys, values]) as name: + keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys") + values = ops.convert_to_tensor( + values, dtype=self._value_dtype, name="values") with ops.colocate_with(self._table_ref): op = gen_lookup_ops.lookup_table_insert_v2( self._table_ref, keys, values, name=name) @@ -650,8 +635,6 @@ class MutableDenseHashTable(LookupInterface): exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2( self._table_ref, self._key_dtype, self._value_dtype, name=name) - exported_values.set_shape(exported_keys.get_shape().concatenate( - self._value_shape)) return exported_keys, exported_values class _Saveable(BaseSaverBuilder.SaveableObject): diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py index 8d510ede5827df3889307c0f38572bece84f102e..6fb5244fc6230e1c6f6da7708fe30c20a163494c 100644 --- a/tensorflow/contrib/lookup/lookup_ops_test.py +++ b/tensorflow/contrib/lookup/lookup_ops_test.py @@ -434,8 +434,10 @@ class MutableHashTableOpTest(test.TestCase): self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result) exported_keys, exported_values = table.export() - self.assertAllEqual([None], exported_keys.get_shape().as_list()) - self.assertAllEqual([None, 2], exported_values.get_shape().as_list()) + self.assertAllEqual([None], exported_keys.get_shape().as_list(), + msg="Saw shape %s" % exported_keys.shape) + self.assertAllEqual([None, 2], exported_values.get_shape().as_list(), + msg="Saw shape %s" % exported_values.shape) # exported data is in the order of the internal map, i.e. undefined sorted_keys = np.sort(exported_keys.eval()) sorted_values = np.sort(exported_values.eval()) @@ -669,7 +671,7 @@ class MutableHashTableOpTest(test.TestCase): # lookup with keys of the wrong type input_string = constant_op.constant([1, 2, 3], dtypes.int64) - with self.assertRaises(TypeError): + with self.assertRaises(ValueError): table.lookup(input_string).eval() # default value of the wrong type @@ -853,7 +855,8 @@ class MutableDenseHashTableOpTest(test.TestCase): input_string = constant_op.constant([11, 12, 15], dtypes.int64) output = table.lookup(input_string) - self.assertAllEqual([3, 4], output.get_shape()) + self.assertAllEqual( + [3, 4], output.shape, msg="Saw shape: %s" % output.shape) result = output.eval() self.assertAllEqual([[0, 1, 2, 3], [3, 4, 5, 6], [-1, -2, -3, -4]], diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 48953e2e3843ff92744514d28bd725cc0d72f3a8..dc9b17a62783817ec9a2998c4d5548c0f05e073b 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -30,8 +30,14 @@ EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" -PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" -RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" +# Note: The Protobuf source in `tensorflow/workspace.bzl` in TensorFlow +# 1.10 branch does not work. `make distclean` fails and blocks the build +# process. For now we're hardcoding to the version which is used by +# TensorFlow 1.9. +PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz" +# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once +# the archive has been propagated in mirror.bazel.build. +RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py index be7377b1519f3bdab8755411af3de7aa0c2dc9eb..eba505881fb648cf4993e2b8ce7d935dca0f4830 100644 --- a/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py +++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager.py @@ -41,12 +41,12 @@ class LossScaleManager(object): applied on variables. This class is used together with - @{tf.contrib.mixed_precision.LossScaleOptimizer} for mixed precision training + `tf.contrib.mixed_precision.LossScaleOptimizer` for mixed precision training (float32 variables and float16 ops) on Nvidia GPUs in order to achieve the same model quality as single precision training, with the benefits of potential higher throughput. - See @{tf.contrib.mixed_precision.LossScaleOptimizer} for more details. + See `tf.contrib.mixed_precision.LossScaleOptimizer` for more details. """ @abc.abstractmethod diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py index 93050a3ae373603c516c7eb72c22f327f4a60a00..fcce52a07a88547af437382c3ec060b23c9d334e 100644 --- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py +++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py @@ -103,7 +103,7 @@ class LossScaleOptimizer(optimizer.Optimizer): Args: opt: The actual optimizer that will be used to compute and apply the - gradients. Must be an implementation of the @{tf.train.Optimizer} + gradients. Must be an implementation of the `tf.train.Optimizer` interface. loss_scale_manager: A LossScaleManager object. """ @@ -117,7 +117,7 @@ class LossScaleOptimizer(optimizer.Optimizer): aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None): - """Compute gradients. See base class @{tf.train.Optimizer}.""" + """Compute gradients. See base class `tf.train.Optimizer`.""" loss_scale = self._loss_scale_manager.get_loss_scale() if context.executing_eagerly(): @@ -141,7 +141,7 @@ class LossScaleOptimizer(optimizer.Optimizer): return self._down_scale(grads_and_vars, loss_scale) def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """Apply gradients. See base class @{tf.train.Optimizer}.""" + """Apply gradients. See base class `tf.train.Optimizer`.""" grads = [g for (g, _) in grads_and_vars] is_finite_grad = [] diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD index 54bd39afacbec07f054f61b72eda0a3654858aa7..16ddc38f5a5ba88485e18b136b2b1081b0e2ff0f 100644 --- a/tensorflow/contrib/model_pruning/BUILD +++ b/tensorflow/contrib/model_pruning/BUILD @@ -95,6 +95,22 @@ py_library( ], ) +py_library( + name = "strip_pruning_vars_lib", + srcs = ["python/strip_pruning_vars_lib.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":pruning", + "//tensorflow/python:client", + "//tensorflow/python:framework", + "//tensorflow/python:platform", + "//tensorflow/python:training", + "//third_party/py/numpy", + "@six_archive//:six", + ], +) + py_test( name = "pruning_utils_test", size = "small", @@ -129,6 +145,31 @@ py_test( ], ) +py_test( + name = "strip_pruning_vars_test", + size = "small", + srcs = ["python/strip_pruning_vars_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":layers", + ":pruning", + ":rnn_cells", + ":strip_pruning_vars_lib", + "//tensorflow/python:client_testlib", + ], +) + +py_binary( + name = "strip_pruning_vars", + srcs = ["python/strip_pruning_vars.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":strip_pruning_vars_lib", + "//tensorflow/python:platform", + ], +) + py_library( name = "init_py", srcs = ["__init__.py"], @@ -145,5 +186,6 @@ py_library( ":learning", ":pruning", ":rnn_cells", + ":strip_pruning_vars_lib", ], ) diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md index 9143d082bf08fefa7aa522455eb3af911e636ae0..a5267fd90482287a65a4c38ae257a0af349523e8 100644 --- a/tensorflow/contrib/model_pruning/README.md +++ b/tensorflow/contrib/model_pruning/README.md @@ -4,7 +4,15 @@ This document describes the API that facilitates magnitude-based pruning of neural network's weight tensors. The API helps inject necessary tensorflow op into the training graph so the model can be pruned while it is being trained. -### Model creation +## Table of contents +1. [Model creation](#model-creation) +2. [Hyperparameters for pruning](#hyperparameters) + - [Block sparsity](#block-sparsity) +3. [Adding pruning ops to the training graph](#adding-pruning-ops) +4. [Removing pruning ops from trained model](#remove) +5. [Example](#example) + +### Model creation The first step involves adding mask and threshold variables to the layers that need to undergo pruning. The variable mask is the same shape as the layer's @@ -33,7 +41,7 @@ auxiliary variables built-in (see * [rnn_cells.MaskedLSTMCell](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py?l=154) -### Adding pruning ops to the training graph +### Pruning-related hyperparameters The pruning library allows for specification of the following hyper parameters: @@ -42,7 +50,7 @@ The pruning library allows for specification of the following hyper parameters: | name | string | model_pruning | Name of the pruning specification. Used for adding summaries and ops under a common tensorflow name_scope | | begin_pruning_step | integer | 0 | The global step at which to begin pruning | | end_pruning_step | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till the training stops | -| do_not_prune | list of strings | [""] | list of layers names that are not pruned | +| weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. | | threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds | | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) | | nbins | integer | 256 | Number of bins to use for histogram computation | @@ -64,7 +72,13 @@ is divided into $$n$$ intervals of size equal to the pruning_frequency ($$\Delta t$$). $$s_f$$ is the target_sparsity, $$s_i$$ is the initial_sparsity, $$t_0$$ is the sparsity_function_begin_step. In this equation, the sparsity_function_exponent is set to 3. -### Adding pruning ops to the training graph + +#### Block Sparsity + +For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is only supported for weight tensors which can be squeezed to rank 2. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter). +The convolution layer tensors are always pruned used block dimensions of [1,1]. + +### Adding pruning ops to the training graph The final step involves adding ops to the training graph that monitor the distribution of the layer's weight magnitudes and determine the layer threshold, @@ -105,7 +119,19 @@ with tf.graph.as_default(): ``` Ensure that `global_step` is being [incremented](https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#minimize), otherwise pruning will not work! -## Example: Pruning and training deep CNNs on the cifar10 dataset +### Removing pruning ops from the trained graph +Once the model is trained, it is necessary to remove the auxiliary variables (mask, threshold) and pruning ops added to the graph in the steps above. This can be accomplished using the `strip_pruning_vars` utility. + +This utility generates a binary GraphDef in which the variables have been converted to constants. In particular, the threshold variables are removed from the graph and the mask variable is fused with the corresponding weight tensor to produce a `masked_weight` tensor. This tensor is sparse, has the same size as the weight tensor, and the sparsity is as set by the `target_sparsity` or the `weight_sparsity_map` hyperparameters above. + +```shell +$ bazel build -c opt contrib/model_pruning:strip_pruning_vars +$ bazel-bin/contrib/model_pruning/strip_pruning_vars --checkpoint_dir=/path/to/checkpoints/ --output_node_names=graph_node1,graph_node2 --output_dir=/tmp --filename=pruning_stripped.pb +``` + +For now, it is assumed that the underlying hardware platform will provide mechanisms for compressing the sparse tensors and/or accelerating the sparse tensor computations. + +## Example: Pruning and training deep CNNs on the cifar10 dataset Please see https://www.tensorflow.org/tutorials/deep_cnn for details on neural network architecture, setting up inputs etc. The additional changes needed to @@ -121,7 +147,7 @@ incorporate pruning are captured in the following: To train the pruned version of cifar10: -```bash +```shell $ examples_dir=contrib/model_pruning/examples $ bazel build -c opt $examples_dir/cifar10:cifar10_{train,eval} $ bazel-bin/$examples_dir/cifar10/cifar10_train --pruning_hparams=name=cifar10_pruning,begin_pruning_step=10000,end_pruning_step=100000,target_sparsity=0.9,sparsity_function_begin_step=10000,sparsity_function_end_step=100000 @@ -133,10 +159,14 @@ Eval: $ bazel-bin/$examples_dir/cifar10/cifar10_eval --run_once ``` -### Block Sparsity +Removing pruning nodes from the trained graph: -For some hardware architectures, it may be beneficial to induce spatially correlated sparsity. To train models in which the weight tensors have block sparse structure, set *block_height* and *block_width* hyperparameters to the desired block configuration (2x2, 4x4, 4x1, 1x8, etc). Currently, block sparsity is only supported for weight tensors which can be squeezed to rank 2. The matrix is partitioned into non-overlapping blocks of size *[block_height, block_dim]* and the either the average or max absolute value in this block is taken as a proxy for the entire block (set by *block_pooling_function* hyperparameter). -The convolution layer tensors are always pruned used block dimensions of [1,1]. +```shell +$ bazel build -c opt contrib/model_pruning:strip_pruning_vars +$ bazel-bin/contrib/model_pruning/strip_pruning_vars --checkpoint_path=/tmp/cifar10_train --output_node_names=softmax_linear/softmax_linear_2 --filename=cifar_pruned.pb +``` + +The generated GraphDef (cifar_pruned.pb) may be visualized using the [`import_pb_to_tensorboard`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python/tools/import_pb_to_tensorboard.py) utility ## References diff --git a/tensorflow/contrib/model_pruning/__init__.py b/tensorflow/contrib/model_pruning/__init__.py index d32bedbcd6b63bc8e473a9e9d1c8e0753877e6f8..6eca54aaee186f5873a84ef2cb3ff3c7cfb42cd4 100644 --- a/tensorflow/contrib/model_pruning/__init__.py +++ b/tensorflow/contrib/model_pruning/__init__.py @@ -33,6 +33,9 @@ from tensorflow.contrib.model_pruning.python.pruning import get_thresholds from tensorflow.contrib.model_pruning.python.pruning import get_weight_sparsity from tensorflow.contrib.model_pruning.python.pruning import get_weights from tensorflow.contrib.model_pruning.python.pruning import Pruning +from tensorflow.contrib.model_pruning.python.strip_pruning_vars_lib import graph_def_from_checkpoint +from tensorflow.contrib.model_pruning.python.strip_pruning_vars_lib import strip_pruning_vars_fn + # pylint: enable=unused-import from tensorflow.python.util.all_util import remove_undocumented @@ -41,7 +44,8 @@ _allowed_symbols = [ 'masked_convolution', 'masked_conv2d', 'masked_fully_connected', 'MaskedBasicLSTMCell', 'MaskedLSTMCell', 'train', 'apply_mask', 'get_masked_weights', 'get_masks', 'get_pruning_hparams', 'get_thresholds', - 'get_weights', 'get_weight_sparsity', 'Pruning' + 'get_weights', 'get_weight_sparsity', 'Pruning', 'strip_pruning_vars_fn', + 'graph_def_from_checkpoint' ] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py index 466daf204a1ae86a7f37107342046305ea7249fc..d453e350f05c8e66df13c3861959980d69a564e8 100644 --- a/tensorflow/contrib/model_pruning/python/layers/layers.py +++ b/tensorflow/contrib/model_pruning/python/layers/layers.py @@ -139,7 +139,7 @@ def masked_convolution(inputs, with "NC". num_outputs: Integer, the number of output filters. kernel_size: A sequence of N positive integers specifying the spatial - dimensions of of the filters. Can be a single integer to specify the same + dimensions of the filters. Can be a single integer to specify the same value for all spatial dimensions. stride: A sequence of N positive integers specifying the stride at which to compute output. Can be a single integer to specify the same value for all diff --git a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py index a5b050d25d00b298a20f7ce6abdda7c1d00db899..5f6c6aea74f2965ccfe552a58cde290b5506ef12 100644 --- a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py +++ b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py @@ -48,7 +48,7 @@ class MaskedBasicLSTMCell(tf_rnn.BasicLSTMCell): It does not allow cell clipping, a projection layer, and does not use peep-hole connections: it is the basic baseline. - For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell} + For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell` that follows. """ diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py index da9d398cbc06299a33ab400cc9b4d780531211db..cd58526ed3620d4bd880cf36d806afac70c4bff7 100644 --- a/tensorflow/contrib/model_pruning/python/pruning.py +++ b/tensorflow/contrib/model_pruning/python/pruning.py @@ -152,8 +152,11 @@ def get_pruning_hparams(): end_pruning_step: integer the global step at which to terminate pruning. Defaults to -1 implying that pruning continues till the training stops - do_not_prune: list of strings - list of layers that are not pruned + weight_sparsity_map: list of strings + comma separed list of weight variable name:target sparsity pairs. + For layers/weights not in this list, sparsity as specified by the + target_sparsity hyperparameter is used. + Eg. [conv1:0.9,conv2/kernel:0.8] threshold_decay: float the decay factor to use for exponential decay of the thresholds pruning_frequency: integer @@ -200,7 +203,7 @@ def get_pruning_hparams(): name='model_pruning', begin_pruning_step=0, end_pruning_step=-1, - do_not_prune=[''], + weight_sparsity_map=[''], threshold_decay=0.9, pruning_frequency=10, nbins=256, @@ -234,6 +237,9 @@ class Pruning(object): # Pruning specification self._spec = spec if spec else get_pruning_hparams() + # Sanity check for pruning hparams + self._validate_spec() + # A tensorflow variable that tracks the sparsity function. # If not provided as input, the graph must already contain the global_step # variable before calling this constructor. @@ -256,6 +262,37 @@ class Pruning(object): # Block pooling function self._block_pooling_function = self._spec.block_pooling_function + # Mapping of weight names and target sparsity + self._weight_sparsity_map = self._get_weight_sparsity_map() + + def _validate_spec(self): + spec = self._spec + if spec.begin_pruning_step < 0: + raise ValueError('Illegal value for begin_pruning_step') + + if spec.begin_pruning_step >= spec.end_pruning_step: + if spec.end_pruning_step != -1: + raise ValueError( + 'Pruning must begin before it can end. begin_step=%d, end_step=%d.' + 'Set end_pruning_step to -1 if pruning is required till training' + 'stops' % (spec.begin_pruning_step, spec.end_pruning_step)) + + if spec.sparsity_function_begin_step < 0: + raise ValueError('Illegal value for sparsity_function_begin_step') + + if spec.sparsity_function_begin_step >= spec.sparsity_function_end_step: + raise ValueError( + 'Sparsity function requires begin_step < end_step') + + if not 0.0 <= spec.threshold_decay < 1.0: + raise ValueError('threshold_decay must be in range [0,1)') + + if not 0.0 <= spec.initial_sparsity < 1.0: + raise ValueError('initial_sparsity must be in range [0,1)') + + if not 0.0 <= spec.target_sparsity < 1.0: + raise ValueError('target_sparsity must be in range [0,1)') + def _setup_global_step(self, global_step): graph_global_step = global_step if graph_global_step is None: @@ -270,11 +307,6 @@ class Pruning(object): target_sparsity = self._spec.target_sparsity exponent = self._spec.sparsity_function_exponent - if begin_step >= end_step: - raise ValueError( - 'Pruning must begin before it can end. begin_step=%d, end_step=%d' % - (begin_step, end_step)) - with ops.name_scope(self._spec.name): p = math_ops.minimum( 1.0, @@ -306,15 +338,36 @@ class Pruning(object): 'last_mask_update_step', dtype=dtypes.int32) return last_update_step - def _exists_in_do_not_prune_list(self, tensor_name): - do_not_prune_list = self._spec.do_not_prune - if not do_not_prune_list[0]: - return False - for layer_name in do_not_prune_list: - if tensor_name.find(layer_name) != -1: - return True - - return False + def _get_weight_sparsity_map(self): + """Return the map of weight_name:sparsity parsed from the hparams.""" + weight_sparsity_map = {} + val_list = self._spec.weight_sparsity_map + filtered_val_list = [l for l in val_list if l] + for val in filtered_val_list: + weight_name, sparsity = val.split(':') + if float(sparsity) >= 1.0: + raise ValueError('Weight sparsity can not exceed 1.0') + weight_sparsity_map[weight_name] = float(sparsity) + + return weight_sparsity_map + + def _get_sparsity(self, weight_name): + """Return target sparsity for the given layer/weight name.""" + target_sparsity = [ + sparsity for name, sparsity in self._weight_sparsity_map.items() + if weight_name.find(name) != -1 + ] + if not target_sparsity: + return self._sparsity + + if len(target_sparsity) > 1: + raise ValueError( + 'Multiple matches in weight_sparsity_map for weight %s' % weight_name) + # TODO(suyoggupta): This will work when initial_sparsity = 0. Generalize + # to handle other cases as well. + return math_ops.mul( + self._sparsity, + math_ops.div(target_sparsity[0], self._spec.target_sparsity)) def _update_mask(self, weights, threshold): """Updates the mask for a given weight tensor. @@ -342,6 +395,8 @@ class Pruning(object): if self._sparsity is None: raise ValueError('Sparsity variable undefined') + sparsity = self._get_sparsity(weights.op.name) + with ops.name_scope(weights.op.name + '_pruning_ops'): abs_weights = math_ops.abs(weights) max_value = math_ops.reduce_max(abs_weights) @@ -354,7 +409,7 @@ class Pruning(object): math_ops.div( math_ops.reduce_sum( math_ops.cast( - math_ops.less(norm_cdf, self._sparsity), dtypes.float32)), + math_ops.less(norm_cdf, sparsity), dtypes.float32)), float(self._spec.nbins)), max_value) smoothed_threshold = math_ops.add_n([ @@ -453,10 +508,6 @@ class Pruning(object): if is_partitioned: weight = weight.as_tensor() - if self._spec.do_not_prune: - if self._exists_in_do_not_prune_list(mask.name): - continue - new_threshold, new_mask = self._maybe_update_block_mask(weight, threshold) self._assign_ops.append( pruning_utils.variable_assign(threshold, new_threshold)) @@ -507,22 +558,15 @@ class Pruning(object): no_update_op) def add_pruning_summaries(self): - """Adds summaries for this pruning spec. - - Args: none - - Returns: none - """ + """Adds summaries of weight sparsities and thresholds.""" with ops.name_scope(self._spec.name + '_summaries'): summary.scalar('sparsity', self._sparsity) summary.scalar('last_mask_update_step', self._last_update_step) masks = get_masks() thresholds = get_thresholds() for mask, threshold in zip(masks, thresholds): - if not self._exists_in_do_not_prune_list(mask.name): - summary.scalar(mask.op.name + '/sparsity', - nn_impl.zero_fraction(mask)) - summary.scalar(threshold.op.name + '/threshold', threshold) + summary.scalar(mask.op.name + '/sparsity', nn_impl.zero_fraction(mask)) + summary.scalar(threshold.op.name + '/threshold', threshold) def print_hparams(self): logging.info(self._spec.to_json()) diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py index f80b7c52c000f13b5ce98dd442ff21abfac37761..33c4ad58bd7f57422935fc839ddfc64d5e1f00f5 100644 --- a/tensorflow/contrib/model_pruning/python/pruning_test.py +++ b/tensorflow/contrib/model_pruning/python/pruning_test.py @@ -35,8 +35,8 @@ from tensorflow.python.training import training_util class PruningHParamsTest(test.TestCase): PARAM_LIST = [ "name=test", "threshold_decay=0.9", "pruning_frequency=10", - "do_not_prune=[conv1,conv2]", "sparsity_function_end_step=100", - "target_sparsity=0.9" + "sparsity_function_end_step=100", "target_sparsity=0.9", + "weight_sparsity_map=[conv1:0.8,conv2/kernel:0.8]" ] TEST_HPARAMS = ",".join(PARAM_LIST) @@ -55,9 +55,10 @@ class PruningHParamsTest(test.TestCase): self.assertEqual(p._spec.name, "test") self.assertAlmostEqual(p._spec.threshold_decay, 0.9) self.assertEqual(p._spec.pruning_frequency, 10) - self.assertAllEqual(p._spec.do_not_prune, ["conv1", "conv2"]) self.assertEqual(p._spec.sparsity_function_end_step, 100) self.assertAlmostEqual(p._spec.target_sparsity, 0.9) + self.assertEqual(p._weight_sparsity_map["conv1"], 0.8) + self.assertEqual(p._weight_sparsity_map["conv2/kernel"], 0.8) def testInitWithExternalSparsity(self): with self.test_session(): @@ -211,6 +212,37 @@ class PruningTest(test.TestCase): expected_non_zero_count = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40] self.assertAllEqual(expected_non_zero_count, non_zero_count) + def testWeightSpecificSparsity(self): + param_list = [ + "begin_pruning_step=1", "pruning_frequency=1", "end_pruning_step=100", + "target_sparsity=0.5", "weight_sparsity_map=[layer2/weights:0.75]", + "threshold_decay=0.0" + ] + test_spec = ",".join(param_list) + pruning_hparams = pruning.get_pruning_hparams().parse(test_spec) + + with variable_scope.variable_scope("layer1"): + w1 = variables.Variable( + math_ops.linspace(1.0, 100.0, 100), name="weights") + _ = pruning.apply_mask(w1) + with variable_scope.variable_scope("layer2"): + w2 = variables.Variable( + math_ops.linspace(1.0, 100.0, 100), name="weights") + _ = pruning.apply_mask(w2) + + p = pruning.Pruning(pruning_hparams) + mask_update_op = p.conditional_mask_update_op() + increment_global_step = state_ops.assign_add(self.global_step, 1) + + with self.test_session() as session: + variables.global_variables_initializer().run() + for _ in range(110): + session.run(mask_update_op) + session.run(increment_global_step) + + self.assertAllEqual( + session.run(pruning.get_weight_sparsity()), [0.5, 0.75]) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars.py new file mode 100644 index 0000000000000000000000000000000000000000..3385103807f6dbdab2d27882c670a3ccf6a26e9d --- /dev/null +++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars.py @@ -0,0 +1,103 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +r"""Removes the auxiliary variables and ops added by the pruning library. + +Usage: + +bazel build tensorflow/contrib/model_pruning:strip_pruning_vars && \ +bazel-bin/tensorflow/contrib/model_pruning/strip_pruning_vars \ +--checkpoint_dir=/tmp/model_ckpts \ +--output_node_names=softmax \ +--output_dir=/tmp \ +--filename=pruning_stripped.pb +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys + +from tensorflow.contrib.model_pruning.python import strip_pruning_vars_lib +from tensorflow.python.framework import graph_io +from tensorflow.python.platform import app +from tensorflow.python.platform import tf_logging as logging + +FLAGS = None + + +def strip_pruning_vars(checkpoint_dir, output_node_names, output_dir, filename): + """Remove pruning-related auxiliary variables and ops from the graph. + + Accepts training checkpoints and produces a GraphDef in which the pruning vars + and ops have been removed. + + Args: + checkpoint_dir: Path to the checkpoints. + output_node_names: The name of the output nodes, comma separated. + output_dir: Directory where to write the graph. + filename: Output GraphDef file name. + + Returns: + None + + Raises: + ValueError: if output_nodes_names are not provided. + """ + if not output_node_names: + raise ValueError( + 'Need to specify atleast 1 output node through output_node_names flag') + output_node_names = output_node_names.replace(' ', '').split(',') + + initial_graph_def = strip_pruning_vars_lib.graph_def_from_checkpoint( + checkpoint_dir, output_node_names) + + final_graph_def = strip_pruning_vars_lib.strip_pruning_vars_fn( + initial_graph_def, output_node_names) + graph_io.write_graph(final_graph_def, output_dir, filename, as_text=False) + logging.info('\nFinal graph written to %s', os.path.join( + output_dir, filename)) + + +def main(unused_args): + return strip_pruning_vars(FLAGS.checkpoint_dir, FLAGS.output_node_names, + FLAGS.output_dir, FLAGS.filename) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.register('type', 'bool', lambda v: v.lower() == 'true') + parser.add_argument( + '--checkpoint_dir', type=str, default='', help='Path to the checkpoints.') + parser.add_argument( + '--output_node_names', + type=str, + default='', + help='The name of the output nodes, comma separated.') + parser.add_argument( + '--output_dir', + type=str, + default='/tmp', + help='Directory where to write the graph.') + parser.add_argument( + '--filename', + type=str, + default='pruning_stripped.pb', + help='Output \'GraphDef\' file name.') + + FLAGS, unparsed = parser.parse_known_args() + app.run(main=main, argv=[sys.argv[0]] + unparsed) diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars_lib.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_lib.py new file mode 100644 index 0000000000000000000000000000000000000000..fc4b10863f7c46235059f948fbbfcfcf83d3e15b --- /dev/null +++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_lib.py @@ -0,0 +1,142 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utilities to remove pruning-related ops and variables from a GraphDef. +""" + +# pylint: disable=missing-docstring +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.core.framework import attr_value_pb2 +from tensorflow.core.framework import graph_pb2 +from tensorflow.core.framework import node_def_pb2 +from tensorflow.python.client import session +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import saver as saver_lib + + +def _node_name(tensor_name): + """Remove the trailing ':0' from the variable name.""" + if ':' not in tensor_name: + return tensor_name + + return tensor_name.split(':')[0] + + +def _tensor_name(node_name): + """Appends the :0 in the op name to get the canonical tensor name.""" + if ':' in node_name: + return node_name + + return node_name + ':0' + + +def _get_masked_weights(input_graph_def): + """Extracts masked_weights from the graph as a dict of {var_name:ndarray}.""" + input_graph = ops.Graph() + with input_graph.as_default(): + importer.import_graph_def(input_graph_def, name='') + + with session.Session(graph=input_graph) as sess: + masked_weights_dict = {} + for node in input_graph_def.node: + if 'masked_weight' in node.name: + masked_weight_val = sess.run( + sess.graph.get_tensor_by_name(_tensor_name(node.name))) + logging.info( + '%s has %d values, %1.2f%% zeros \n', node.name, + np.size(masked_weight_val), + 100 - float(100 * np.count_nonzero(masked_weight_val)) / + np.size(masked_weight_val)) + masked_weights_dict.update({node.name: masked_weight_val}) + return masked_weights_dict + + +def strip_pruning_vars_fn(input_graph_def, output_node_names): + """Removes mask variable from the graph. + + Replaces the masked_weight tensor with element-wise multiplication of mask + and the corresponding weight variable. + + Args: + input_graph_def: A GraphDef in which the variables have been converted to + constants. This is typically the output of + tf.graph_util.convert_variables_to_constant() + output_node_names: List of name strings for the result nodes of the graph + + Returns: + A GraphDef in which pruning-related variables have been removed + """ + masked_weights_dict = _get_masked_weights(input_graph_def) + pruned_graph_def = graph_pb2.GraphDef() + + # Replace masked_weight with a const op containing the + # result of tf.multiply(mask,weight) + for node in input_graph_def.node: + output_node = node_def_pb2.NodeDef() + if 'masked_weight' in node.name: + output_node.op = 'Const' + output_node.name = node.name + dtype = node.attr['T'] + data = masked_weights_dict[node.name] + output_node.attr['dtype'].CopyFrom(dtype) + output_node.attr['value'].CopyFrom( + attr_value_pb2.AttrValue(tensor=tensor_util.make_tensor_proto(data))) + + else: + output_node.CopyFrom(node) + pruned_graph_def.node.extend([output_node]) + + # Remove stranded nodes: mask and weights + return graph_util.extract_sub_graph(pruned_graph_def, output_node_names) + + +def graph_def_from_checkpoint(checkpoint_dir, output_node_names): + """Converts checkpoint data to GraphDef. + + Reads the latest checkpoint data and produces a GraphDef in which the + variables have been converted to constants. + + Args: + checkpoint_dir: Path to the checkpoints. + output_node_names: List of name strings for the result nodes of the graph. + + Returns: + A GraphDef from the latest checkpoint + + Raises: + ValueError: if no checkpoint is found + """ + checkpoint_path = saver_lib.latest_checkpoint(checkpoint_dir) + if checkpoint_path is None: + raise ValueError('Could not find a checkpoint at: {0}.' + .format(checkpoint_dir)) + + saver_for_restore = saver_lib.import_meta_graph( + checkpoint_path + '.meta', clear_devices=True) + with session.Session() as sess: + saver_for_restore.restore(sess, checkpoint_path) + graph_def = ops.get_default_graph().as_graph_def() + output_graph_def = graph_util.convert_variables_to_constants( + sess, graph_def, output_node_names) + + return output_graph_def diff --git a/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py new file mode 100644 index 0000000000000000000000000000000000000000..255daa036099c0d3ef2dbc5eb37fdb0c31c71383 --- /dev/null +++ b/tensorflow/contrib/model_pruning/python/strip_pruning_vars_test.py @@ -0,0 +1,232 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for strip_pruning_vars.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re + +from tensorflow.contrib.model_pruning.python import pruning +from tensorflow.contrib.model_pruning.python import strip_pruning_vars_lib +from tensorflow.contrib.model_pruning.python.layers import layers +from tensorflow.contrib.model_pruning.python.layers import rnn_cells +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import rnn +from tensorflow.python.ops import rnn_cell as tf_rnn_cells +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import training_util + + +def _get_number_pruning_vars(graph_def): + number_vars = 0 + for node in graph_def.node: + if re.match(r"^.*(mask$)|(threshold$)", node.name): + number_vars += 1 + return number_vars + + +def _get_node_names(tensor_names): + return [ + strip_pruning_vars_lib._node_name(tensor_name) + for tensor_name in tensor_names + ] + + +class StripPruningVarsTest(test.TestCase): + + def setUp(self): + param_list = [ + "pruning_frequency=1", "begin_pruning_step=1", "end_pruning_step=10", + "nbins=2048", "threshold_decay=0.0" + ] + self.initial_graph = ops.Graph() + self.initial_graph_def = None + self.final_graph = ops.Graph() + self.final_graph_def = None + self.pruning_spec = ",".join(param_list) + with self.initial_graph.as_default(): + self.sparsity = variables.Variable(0.5, name="sparsity") + self.global_step = training_util.get_or_create_global_step() + self.increment_global_step = state_ops.assign_add(self.global_step, 1) + self.mask_update_op = None + + def _build_convolutional_model(self, number_of_layers): + # Create a graph with several conv2d layers + kernel_size = 3 + base_depth = 4 + depth_step = 7 + height, width = 7, 9 + with variable_scope.variable_scope("conv_model"): + input_tensor = array_ops.ones((8, height, width, base_depth)) + top_layer = input_tensor + for ix in range(number_of_layers): + top_layer = layers.masked_conv2d( + top_layer, + base_depth + (ix + 1) * depth_step, + kernel_size, + scope="Conv_" + str(ix)) + + return top_layer + + def _build_fully_connected_model(self, number_of_layers): + base_depth = 4 + depth_step = 7 + + input_tensor = array_ops.ones((8, base_depth)) + + top_layer = input_tensor + + with variable_scope.variable_scope("fc_model"): + for ix in range(number_of_layers): + top_layer = layers.masked_fully_connected( + top_layer, base_depth + (ix + 1) * depth_step) + + return top_layer + + def _build_lstm_model(self, number_of_layers): + batch_size = 8 + dim = 10 + inputs = variables.Variable(random_ops.random_normal([batch_size, dim])) + + def lstm_cell(): + return rnn_cells.MaskedBasicLSTMCell( + dim, forget_bias=0.0, state_is_tuple=True, reuse=False) + + cell = tf_rnn_cells.MultiRNNCell( + [lstm_cell() for _ in range(number_of_layers)], state_is_tuple=True) + + outputs = rnn.static_rnn( + cell, [inputs], + initial_state=cell.zero_state(batch_size, dtypes.float32)) + + return outputs + + def _prune_model(self, session): + pruning_hparams = pruning.get_pruning_hparams().parse(self.pruning_spec) + p = pruning.Pruning(pruning_hparams, sparsity=self.sparsity) + self.mask_update_op = p.conditional_mask_update_op() + + variables.global_variables_initializer().run() + for _ in range(20): + session.run(self.mask_update_op) + session.run(self.increment_global_step) + + def _get_outputs(self, session, input_graph, tensors_list, graph_prefix=None): + outputs = [] + + for output_tensor in tensors_list: + if graph_prefix: + output_tensor = graph_prefix + "/" + output_tensor + outputs.append( + session.run(session.graph.get_tensor_by_name(output_tensor))) + + return outputs + + def _get_initial_outputs(self, output_tensor_names_list): + with self.test_session(graph=self.initial_graph) as sess1: + self._prune_model(sess1) + reference_outputs = self._get_outputs(sess1, self.initial_graph, + output_tensor_names_list) + + self.initial_graph_def = graph_util.convert_variables_to_constants( + sess1, sess1.graph.as_graph_def(), + _get_node_names(output_tensor_names_list)) + return reference_outputs + + def _get_final_outputs(self, output_tensor_names_list): + self.final_graph_def = strip_pruning_vars_lib.strip_pruning_vars_fn( + self.initial_graph_def, _get_node_names(output_tensor_names_list)) + _ = importer.import_graph_def(self.final_graph_def, name="final") + + with self.test_session(self.final_graph) as sess2: + final_outputs = self._get_outputs( + sess2, + self.final_graph, + output_tensor_names_list, + graph_prefix="final") + return final_outputs + + def _check_removal_of_pruning_vars(self, number_masked_layers): + self.assertEqual( + _get_number_pruning_vars(self.initial_graph_def), number_masked_layers) + self.assertEqual(_get_number_pruning_vars(self.final_graph_def), 0) + + def _check_output_equivalence(self, initial_outputs, final_outputs): + for initial_output, final_output in zip(initial_outputs, final_outputs): + self.assertAllEqual(initial_output, final_output) + + def testConvolutionalModel(self): + with self.initial_graph.as_default(): + number_masked_conv_layers = 5 + top_layer = self._build_convolutional_model(number_masked_conv_layers) + output_tensor_names = [top_layer.name] + initial_outputs = self._get_initial_outputs(output_tensor_names) + + # Remove pruning-related nodes. + with self.final_graph.as_default(): + final_outputs = self._get_final_outputs(output_tensor_names) + + # Check that the final graph has no pruning-related vars + self._check_removal_of_pruning_vars(number_masked_conv_layers) + + # Check that outputs remain the same after removal of pruning-related nodes + self._check_output_equivalence(initial_outputs, final_outputs) + + def testFullyConnectedModel(self): + with self.initial_graph.as_default(): + number_masked_fc_layers = 3 + top_layer = self._build_fully_connected_model(number_masked_fc_layers) + output_tensor_names = [top_layer.name] + initial_outputs = self._get_initial_outputs(output_tensor_names) + + # Remove pruning-related nodes. + with self.final_graph.as_default(): + final_outputs = self._get_final_outputs(output_tensor_names) + + # Check that the final graph has no pruning-related vars + self._check_removal_of_pruning_vars(number_masked_fc_layers) + + # Check that outputs remain the same after removal of pruning-related nodes + self._check_output_equivalence(initial_outputs, final_outputs) + + def testLSTMModel(self): + with self.initial_graph.as_default(): + number_masked_lstm_layers = 2 + outputs = self._build_lstm_model(number_masked_lstm_layers) + output_tensor_names = [outputs[0][0].name] + initial_outputs = self._get_initial_outputs(output_tensor_names) + + # Remove pruning-related nodes. + with self.final_graph.as_default(): + final_outputs = self._get_final_outputs(output_tensor_names) + + # Check that the final graph has no pruning-related vars + self._check_removal_of_pruning_vars(number_masked_lstm_layers) + + # Check that outputs remain the same after removal of pruning-related nodes + self._check_output_equivalence(initial_outputs, final_outputs) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py index 2f92d05ba81f30a91f68f3c3ec51b6695d3d0371..98f4264fe0813d421f559594efae73608e53ca62 100644 --- a/tensorflow/contrib/nn/python/ops/alpha_dropout.py +++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py @@ -43,7 +43,7 @@ def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylin noise_shape: A 1-D `Tensor` of type `int32`, representing the shape for randomly generated keep/drop flags. seed: A Python integer. Used to create random seeds. See - @{tf.set_random_seed} for behavior. + `tf.set_random_seed` for behavior. name: A name for this operation (optional). Returns: diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py index e65925610c5f5125c2d2e92edc1cf708c54255d4..de71b0845e292b3ee03848afc6cc05c15286d9e8 100644 --- a/tensorflow/contrib/nn/python/ops/sampling_ops.py +++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py @@ -123,15 +123,15 @@ def rank_sampled_softmax_loss(weights, """Computes softmax loss using rank-based adaptive resampling. This has been shown to improve rank loss after training compared to - @{tf.nn.sampled_softmax_loss}. For a description of the algorithm and some + `tf.nn.sampled_softmax_loss`. For a description of the algorithm and some experimental results, please see: [TAPAS: Two-pass Approximate Adaptive Sampling for Softmax](https://arxiv.org/abs/1707.03073). Sampling follows two phases: * In the first phase, `num_sampled` classes are selected using - @{tf.nn.learned_unigram_candidate_sampler} or supplied `sampled_values`. + `tf.nn.learned_unigram_candidate_sampler` or supplied `sampled_values`. The logits are calculated on those sampled classes. This phases is - similar to @{tf.nn.sampled_softmax_loss}. + similar to `tf.nn.sampled_softmax_loss`. * In the second phase, the `num_resampled` classes with highest predicted probability are kept. Probabilities are `LogSumExp(logits / resampling_temperature)`, where the sum is over @@ -142,7 +142,7 @@ def rank_sampled_softmax_loss(weights, picks more candidates close to the predicted classes. A common strategy is to decrease the temperature as training proceeds. - See @{tf.nn.sampled_softmax_loss} for more documentation on sampling and + See `tf.nn.sampled_softmax_loss` for more documentation on sampling and for typical default values for some of the parameters. This operation is for training only. It is generally an underestimate of @@ -197,7 +197,7 @@ def rank_sampled_softmax_loss(weights, where a sampled class equals one of the target classes. partition_strategy: A string specifying the partitioning strategy, relevant if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported. - See @{tf.nn.embedding_lookup} for more details. + See `tf.nn.embedding_lookup` for more details. name: A name for the operation (optional). Returns: diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index bbdf962d0480e52045d31f65b3d137ed3f11f2f1..778b710d78a2095b8a1315018641c67419c26b98 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -27,6 +27,7 @@ py_library( "python/training/nadam_optimizer.py", "python/training/powersign.py", "python/training/reg_adagrad_optimizer.py", + "python/training/shampoo.py", "python/training/sign_decay.py", "python/training/variable_clipping_optimizer.py", "python/training/weight_decay_optimizers.py", @@ -344,3 +345,23 @@ py_test( "//third_party/py/numpy", ], ) + +py_test( + name = "shampoo_test", + size = "large", + srcs = ["python/training/shampoo_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":opt_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform", + "//tensorflow/python:platform_test", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:variables", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", + ], +) diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py index 3e63e99030c46c254625ca8fdccce614cd60e8b0..9471fb018162ee377e9c614d6e4d745b4282165a 100644 --- a/tensorflow/contrib/opt/__init__.py +++ b/tensorflow/contrib/opt/__init__.py @@ -30,10 +30,10 @@ from tensorflow.contrib.opt.python.training.model_average_optimizer import * from tensorflow.contrib.opt.python.training.moving_average_optimizer import * from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import * from tensorflow.contrib.opt.python.training.nadam_optimizer import * +from tensorflow.contrib.opt.python.training.shampoo import * from tensorflow.contrib.opt.python.training.weight_decay_optimizers import * from tensorflow.contrib.opt.python.training.powersign import * from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import * -from tensorflow.contrib.opt.python.training.weight_decay_optimizers import * # pylint: enable=wildcard-import from tensorflow.python.util.all_util import remove_undocumented @@ -62,6 +62,7 @@ _allowed_symbols = [ 'ModelAverageOptimizer', 'ModelAverageCustomGetter', 'GGTOptimizer', + 'ShampooOptimizer', ] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/opt/python/training/shampoo.py b/tensorflow/contrib/opt/python/training/shampoo.py new file mode 100644 index 0000000000000000000000000000000000000000..294627f42a839f399f747bcd6ba50968aadb35a1 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/shampoo.py @@ -0,0 +1,474 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""The Shampoo Optimizer. + +Variant of Adagrad using one preconditioner matrix per variable dimension. +For details, see https://arxiv.org/abs/1802.09568 +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.platform import tf_logging +from tensorflow.python.training import optimizer + + +def GetParam(var, timestep): + if callable(var): + return var(timestep) + else: + return var + + +class ShampooOptimizer(optimizer.Optimizer): + """The Shampoo Optimizer + + Variant of Adagrad using one preconditioner matrix per variable dimension. + For details, see https://arxiv.org/abs/1802.09568 + + gbar is time-weighted accumulated gradient: + gbar[t] = gbar_decay[t] * gbar[t-1] + gbar_weight[t] * g[t] + + mat_gbar is time-weighted accumulated gradient square: + mat_gbar_j[t] = mat_gbar_decay[t] * mat_gbar_j[t-1] + + mat_gbar_weight[t] * gg_j[t] + where if g[t] = g_abcd then gg_a[t] = g_abcd g_a'bcd (Einstein notation) + + Update rule: + w[t+1] = w[t] - learning_rate[t] * Prod_j mat_gbar_j[t]^(-alpha/n) gbar[t] + Again, mat_gbar_j[t]^(-alpha) gbar[t] is a tensor contraction along the + j'th dimension of gbar[t] with the first dimension of + mat_gbar_j[t]^(-alpha/n), where alpha is a hyperparameter, + and n = rank of the variable. + Prod_j represents doing this contraction for all j in 0..n-1. + + Typically learning_rate is constant, but could be time dependent by passing + a lambda function that depends on step. + """ + + def __init__(self, + global_step=0, + max_matrix_size=768, + gbar_decay=0.0, + gbar_weight=1.0, + mat_gbar_decay=1.0, + mat_gbar_weight=1.0, + learning_rate=1.0, + svd_interval=1, + precond_update_interval=1, + epsilon=0.1, + alpha=0.5, + use_iterative_root=False, + use_locking=False, + name="Shampoo"): + """Default values of the various hyper-parameters. + + gbar_decay, gbar_weight etc. can be a float or a time varying parameter. + For time-varying parameters use e.g. "lambda T: T / (T + 1.0)" + where the expression in the lambda is a tensorflow expression + + Args: + global_step: tensorflow variable indicating the step. + max_matrix_size: We do not perform SVD for matrices larger than this. + gbar_decay: + gbar_weight: Used to update gbar: + gbar[t] = gbar_decay[t] * gbar[t-1] + gbar_weight[t] * g[t] + mat_gbar_decay: + mat_gbar_weight: Used to update mat_gbar: + mat_gbar_j[t] = mat_gbar_decay[t] * mat_gbar_j[t-1] + + mat_gbar_weight[t] * gg_j[t] + learning_rate: Similar to SGD + svd_interval: We should do SVD after this many steps. Default = 1, i.e. + every step. Usually 20 leads to no loss of accuracy, and + 50 or 100 is also OK. May also want more often early, + and less often later - set in caller as for example: + "svd_interval = lambda(T): tf.cond( + T < 2000, lambda: 20.0, lambda: 1000.0)" + precond_update_interval: We should update the preconditioners after + this many steps. Default = 1. Usually less than + svd_interval. + epsilon: epsilon * I_n is added to each mat_gbar_j for stability + alpha: total power of the preconditioners. + use_iterative_root: should the optimizer use SVD (faster) or the + iterative root method (for TPU) for finding the + roots of PSD matrices. + use_locking: + name: name of optimizer. + """ + + super(ShampooOptimizer, self).__init__(use_locking, name) + + self._global_step = math_ops.to_float(global_step) + self._max_matrix_size = max_matrix_size + self._gbar_decay = gbar_decay + self._gbar_weight = gbar_weight + self._mat_gbar_decay = mat_gbar_decay + self._mat_gbar_weight = mat_gbar_weight + self._learning_rate = learning_rate + self._svd_interval = svd_interval + self._precond_update_interval = precond_update_interval + self._epsilon = epsilon + self._alpha = alpha + self._use_iterative_root = use_iterative_root + self._name = name + + def _create_slots(self, var_list): + for v in var_list: + with ops.colocate_with(v): + _ = self._zeros_slot(v, "gbar", self._name) + shape = np.array(v.get_shape()) + for i, d in enumerate(shape): + d_tensor = ops.convert_to_tensor(d) + if d <= self._max_matrix_size: + mat_g_init = array_ops.zeros_like(linalg_ops.eye(d_tensor)) + if self._svd_interval > 1: + _ = self._get_or_make_slot(v, linalg_ops.eye(d_tensor), + "H_" + str(i), self._name) + else: + mat_g_init = array_ops.zeros([d_tensor]) + + _ = self._get_or_make_slot(v, mat_g_init, "Gbar_" + str(i), + self._name) + + def _resource_apply_dense(self, grad, var): + return self._apply_dense(grad, var) + + def _apply_dense(self, grad, var): + return self._apply_gradient(grad, var) + + def _resource_apply_sparse(self, grad_values, var, grad_indices): + return self._apply_sparse_shared(grad_values, grad_indices, var) + + def _apply_sparse(self, grad, var): + return self._apply_sparse_shared(grad.values, grad.indices, var) + + def _apply_sparse_shared(self, grad_values, grad_indices, var): + if var.get_shape()[0] <= self._max_matrix_size or self._gbar_decay != 0.0: + # The dimension is small enough, we can make the variable dense and + # do a dense update + dense_grad = array_ops.scatter_nd( + array_ops.expand_dims(grad_indices, axis=1), grad_values, + array_ops.shape(var, out_type=grad_indices.dtype)) + return self._apply_gradient(dense_grad, var) + return self._apply_gradient(grad_values, var, grad_indices) + + def _weighted_average(self, var, weight, weight_t, rest): + """Computes exponential weighted average: var = weight_t * var + rest. + + Important to ensure that var does not occur in rest, otherwise + we can get race conditions in a distributed setting. + + Args: + var: variable to be updated + weight: parameter to be checked. If it is a constant, we can optimize. + weight_t: current value of parameter, used for weighting + rest: the remaining tensor to be added + + Returns: + updated variable. + """ + if weight == 0.0: + return rest # no need to update var, we will never use it. + if weight == 1.0: # common case + return state_ops.assign_add(var, rest) + # The op below can cause race conditions in a distributed setting, + # since computing weight_t * var + rest can take some time, during + # which var may be set by another worker. To prevent this, it should + # be implemented as a C++ op. + return var.assign_add((weight_t - 1) * var + rest) + + def _update_mat_g(self, mat_g, grad, axes, mat_gbar_decay, + mat_gbar_weight, i): + """Updates the cumulative outer products of the gradients. + + Args: + mat_g: the matrix to be updated + grad: the gradient of the variable + axes: a list of k-1 integers 0 to k-1, except i + mat_gbar_decay: constant for weighted average: + mat_g = mat_g * decay + grad * weight + mat_gbar_weight: constant for weighted average + i: index of dimension to be updated. + + Returns: + updated mat_g = mat_g * mat_gbar_decay + grad_outer * mat_gbar_weight + + In Einstein notation if i = 0: grad_outer_aa'= g_abcd g_a'bcd + thus grad_outer is a matrix d_i x d_i, where d_i is the size of the + i'th dimension of g. + Alternate view: If mat_i(grad) is the flattening of grad to a + d_i x (d_1d_2...d_{i-1}d_{i+1}...d_k) matrix, then + grad_outer = mat_i(grad) mat_i(grad).transpose + """ + grad_outer = math_ops.tensordot(grad, grad, axes=(axes, axes), + name="grad_outer_" + str(i)) + return self._weighted_average(mat_g, self._mat_gbar_decay, mat_gbar_decay, + mat_gbar_weight * grad_outer) + + def _compute_power_svd(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name): + """Computes mat_h = mat_g^alpha using svd. mat_g is a symmetric PSD matrix. + + Args: + var: the variable we are updating. + mat_g: the symmetric PSD matrix whose power it to be computed + mat_g_size: size of mat_g + alpha: a real number + mat_h_slot_name: name of slot to store the power, if needed. + + Returns: + mat_h = mat_g^alpha + + Stores mat_h in the appropriate slot, if it exists. + Note that mat_g is PSD. So we could use linalg_ops.self_adjoint_eig. + """ + if mat_g_size == 1: + mat_h = math_ops.pow(mat_g + self._epsilon, alpha) + else: + damping = self._epsilon * linalg_ops.eye(math_ops.to_int32(mat_g_size)) + diag_d, mat_u, mat_v = linalg_ops.svd(mat_g + damping, full_matrices=True) + mat_h = math_ops.matmul( + mat_v * math_ops.pow(math_ops.maximum(diag_d, self._epsilon), alpha), + array_ops.transpose(mat_u)) + if mat_h_slot_name is not None: + return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h) + return mat_h + + def _compute_power_iter(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name, + iter_count=100, epsilon=1e-6): + """Computes mat_g^alpha, where alpha = -1/p, p a positive integer. + + We use an iterative Schur-Newton method from equation 3.2 on page 9 of: + + A Schur-Newton Method for the Matrix p-th Root and its Inverse + by Chun-Hua Guo and Nicholas J. Higham + SIAM Journal on Matrix Analysis and Applications, + 2006, Vol. 28, No. 3 : pp. 788-804 + https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf + + Args: + var: the variable we are updating. + mat_g: the symmetric PSD matrix whose power it to be computed + mat_g_size: size of mat_g. + alpha: exponent, must be -1/p for p a positive integer. + mat_h_slot_name: name of slot to store the power, if needed. + iter_count: Maximum number of iterations. + epsilon: accuracy indicator, useful for early termination. + + Returns: + mat_g^alpha + """ + + identity = linalg_ops.eye(math_ops.to_int32(mat_g_size)) + + def MatPower(mat_m, p): + """Computes mat_m^p, for p a positive integer. + + Power p is known at graph compile time, so no need for loop and cond. + Args: + mat_m: a square matrix + p: a positive integer + + Returns: + mat_m^p + """ + assert p == int(p) and p > 0 + power = None + while p > 0: + if p % 2 == 1: + power = math_ops.matmul(mat_m, power) if power is not None else mat_m + p //= 2 + mat_m = math_ops.matmul(mat_m, mat_m) + return power + + def IterCondition(i, mat_m, _): + return math_ops.logical_and( + i < iter_count, + math_ops.reduce_max(math_ops.abs(mat_m - identity)) > epsilon) + + def IterBody(i, mat_m, mat_x): + mat_m_i = (1 - alpha) * identity + alpha * mat_m + return (i + 1, math_ops.matmul(MatPower(mat_m_i, -1.0/alpha), mat_m), + math_ops.matmul(mat_x, mat_m_i)) + + if mat_g_size == 1: + mat_h = math_ops.pow(mat_g + self._epsilon, alpha) + else: + damped_mat_g = mat_g + self._epsilon * identity + z = (1 - 1 / alpha) / (2 * linalg_ops.norm(damped_mat_g)) + # The best value for z is + # (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) / + # (c_max^{1-alpha} - c_min^{1-alpha}) + # where c_max and c_min are the largest and smallest singular values of + # damped_mat_g. + # The above estimate assumes that c_max > c_min * 2^p. (p = -1/alpha) + # Can replace above line by the one below, but it is less accurate, + # hence needs more iterations to converge. + # z = (1 - 1/alpha) / math_ops.trace(damped_mat_g) + # If we want the method to always converge, use z = 1 / norm(damped_mat_g) + # or z = 1 / math_ops.trace(damped_mat_g), but these can result in many + # extra iterations. + _, _, mat_h = control_flow_ops.while_loop( + IterCondition, IterBody, + [0, damped_mat_g * z, identity * math_ops.pow(z, -alpha)]) + if mat_h_slot_name is not None: + return state_ops.assign(self.get_slot(var, mat_h_slot_name), mat_h) + return mat_h + + def _compute_power(self, var, mat_g, mat_g_size, alpha, mat_h_slot_name=None): + """Just a switch between the iterative power vs svd.""" + with ops.name_scope("matrix_iterative_power"): + if self._use_iterative_root: + return self._compute_power_iter(var, mat_g, mat_g_size, alpha, + mat_h_slot_name) + else: + return self._compute_power_svd(var, mat_g, mat_g_size, alpha, + mat_h_slot_name) + + def _apply_gradient(self, grad, var, indices=None): + """The main function to update a variable. + + Args: + grad: A Tensor containing gradient to apply. + var: A Tensor containing the variable to update. + indices: An array of integers, for sparse update. + + Returns: + Updated variable var = var - learning_rate * preconditioner * grad + + If the gradient is dense, var and grad have the same shape. + If the update is sparse, then the first dimension of the gradient and var + may differ, others are all the same. In this case the indices array + provides the set of indices of the variable which are to be updated with + each row of the gradient. + """ + global_step = self._global_step + 1 + + # Update accumulated weighted average of gradients + gbar = self.get_slot(var, "gbar") + gbar_decay_t = GetParam(self._gbar_decay, global_step) + gbar_weight_t = GetParam(self._gbar_weight, global_step) + if indices is not None: + # Note - the sparse update is not easily implemented, since the + # algorithm needs all indices of gbar to be updated + # if mat_gbar_decay != 1 or mat_gbar_decay != 0. + # One way to make mat_gbar_decay = 1 is by rescaling. + # If we want the update: + # G_{t+1} = a_{t+1} G_t + b_{t+1} w_t + # define: + # r_{t+1} = a_{t+1} * r_t + # h_t = G_t / r_t + # Then: + # h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t + # So we get the mat_gbar_decay = 1 as desired. + # We can implement this in a future version as needed. + # However we still need gbar_decay = 0, otherwise all indices + # of the variable will need to be updated. + if self._gbar_decay != 0.0: + tf_logging.warning("Not applying momentum for variable: %s" % var.name) + gbar_updated = grad + else: + gbar_updated = self._weighted_average(gbar, self._gbar_decay, + gbar_decay_t, + gbar_weight_t * grad) + + # Update the preconditioners and compute the preconditioned gradient + shape = var.get_shape() + mat_g_list = [] + for i in range(len(shape)): + mat_g_list.append(self.get_slot(var, "Gbar_" + str(i))) + mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step) + mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step) + + preconditioned_grad = gbar_updated + v_rank = len(mat_g_list) + neg_alpha = - GetParam(self._alpha, global_step) / v_rank + svd_interval = GetParam(self._svd_interval, global_step) + precond_update_interval = GetParam(self._precond_update_interval, + global_step) + for i, mat_g in enumerate(mat_g_list): + # axes is the list of indices to reduce - everything but the current i. + axes = list(range(i)) + list(range(i+1, v_rank)) + if shape[i] <= self._max_matrix_size: + # If the tensor size is sufficiently small perform full Shampoo update + # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this + # is not strictly correct. However we will use it for now, and + # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg) + + # pylint: disable=g-long-lambda,cell-var-from-loop + mat_g_updated = control_flow_ops.cond( + math_ops.mod(global_step, precond_update_interval) < 1, + lambda: self._update_mat_g( + mat_g, grad, axes, mat_gbar_decay_t, + mat_gbar_weight_t * precond_update_interval, i), + lambda: mat_g) + + if self._svd_interval == 1: + mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha) + else: + mat_h = control_flow_ops.cond( + math_ops.mod(global_step, svd_interval) < 1, + lambda: self._compute_power(var, mat_g_updated, shape[i], + neg_alpha, "H_" + str(i)), + lambda: self.get_slot(var, "H_" + str(i))) + + # mat_h is a square matrix of size d_i x d_i + # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor + # After contraction with a d_i x d_i tensor + # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor + # (the first dimension is contracted out, and the second dimension of + # mat_h is appended). After going through all the indices, it becomes + # a d_0 x ... x d_n tensor again. + preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h, + axes=([0], [0]), + name="precond_" + str(i)) + else: + # Tensor size is too large -- perform diagonal Shampoo update + grad_outer = math_ops.reduce_sum(grad * grad, axis=axes) + if i == 0 and indices is not None: + assert self._mat_gbar_decay == 1.0 + mat_g_updated = state_ops.scatter_add(mat_g, indices, + mat_gbar_weight_t * grad_outer) + mat_h = math_ops.pow( + array_ops.gather(mat_g_updated, indices) + self._epsilon, + neg_alpha) + else: + mat_g_updated = self._weighted_average(mat_g, + self._mat_gbar_decay, + mat_gbar_decay_t, + mat_gbar_weight_t * grad_outer) + mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha) + + # Need to do the transpose to ensure that the tensor becomes + # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above. + preconditioned_grad = array_ops.transpose( + preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h + + # Update the variable based on the Shampoo update + learning_rate_t = GetParam(self._learning_rate, global_step) + if indices is not None: + var_updated = state_ops.scatter_add( + var, indices, -learning_rate_t * preconditioned_grad) + else: + var_updated = state_ops.assign_sub(var, + learning_rate_t * preconditioned_grad) + return var_updated diff --git a/tensorflow/contrib/opt/python/training/shampoo_test.py b/tensorflow/contrib/opt/python/training/shampoo_test.py new file mode 100644 index 0000000000000000000000000000000000000000..2e0a202ae293664d85ece884a505096455cde73c --- /dev/null +++ b/tensorflow/contrib/opt/python/training/shampoo_test.py @@ -0,0 +1,734 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Functional tests for AdaMoo optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized +import numpy as np + +from tensorflow.contrib.opt.python.training import shampoo +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + +TOLERANCE = 1e-3 + + +def np_power(mat_g, alpha): + """Computes mat_g^alpha for a square symmetric matrix mat_g.""" + + mat_u, diag_d, mat_v = np.linalg.svd(mat_g) + diag_d = np.power(diag_d, alpha) + return np.dot(np.dot(mat_u, np.diag(diag_d)), mat_v) + + +class ShampooTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters(('Var', False), ('ResourceVar', True)) + def testBasicVector(self, use_resource_var): + """Similar to the full Adagrad update.""" + + size = 20 + init_var_np = np.zeros(size) + grad_np = np.random.rand(size) + grad_np_2 = np.random.rand(size) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = constant_op.constant(grad_np, dtype=dtypes.float32) + grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32) + + opt = shampoo.ShampooOptimizer(global_step) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * mat_g^{-0.5} * grad + # lr = 1 + mat_g = np.outer(grad_np, grad_np) + mat_h = np_power(mat_g + 0.1 * np.eye(size), -0.5) + new_val_np = init_var_np - np.dot(mat_h, grad_np) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g += np.outer(grad_np_2, grad_np_2) + mat_h = np_power(mat_g + 0.1 * np.eye(size), -0.5) + new_val_np -= np.dot(mat_h, grad_np_2) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters(('Var', False), ('ResourceVar', True)) + def testBasicMatrix(self, use_resource_var): + """Check update when gradient is a matrix.""" + size = [10, 5] + init_var_np = np.zeros(size) + grad_np = np.random.rand(size[0], size[1]) + grad_np_2 = np.random.rand(size[0], size[1]) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = constant_op.constant(grad_np, dtype=dtypes.float32) + grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32) + + opt = shampoo.ShampooOptimizer(global_step) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * mat_g1^{-0.25} * grad * mat_g2^{-0.25} + # lr = 1 + mat_g1 = np.dot(grad_np, grad_np.transpose()) + mat_left = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.25) + mat_g2 = np.dot(grad_np.transpose(), grad_np) + mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25) + new_val_np = init_var_np - np.dot(np.dot(mat_left, grad_np), mat_right) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g1 += np.dot(grad_np_2, grad_np_2.transpose()) + mat_left = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.25) + mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2) + mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25) + new_val_np -= np.dot(np.dot(mat_left, grad_np_2), mat_right) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + def _testBasicTensor(self, use_iterative_root, use_resource_var): + """Check update when gradient is a tensor. + + Args: + use_iterative_root: use iterative power method or SVD to find nth roots. + use_resource_var: use resource var as variables. + """ + size = [10, 5, 7] + init_var_np = np.zeros(size) + grad_np = np.random.rand(size[0], size[1], size[2]) + grad_np_2 = np.random.rand(size[0], size[1], size[2]) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = constant_op.constant(grad_np, dtype=dtypes.float32) + grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32) + + opt = shampoo.ShampooOptimizer(global_step, + use_iterative_root=use_iterative_root) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad + # lr = 1 + mat_g1 = np.tensordot(grad_np, grad_np, axes=([1, 2], [1, 2])) + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2 = np.tensordot(grad_np, grad_np, axes=([0, 2], [0, 2])) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3 = np.tensordot(grad_np, grad_np, axes=([0, 1], [0, 1])) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + precond_grad = np.tensordot(grad_np, mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np = init_var_np - precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g1 += np.tensordot(grad_np_2, grad_np_2, axes=([1, 2], [1, 2])) + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2 += np.tensordot(grad_np_2, grad_np_2, axes=([0, 2], [0, 2])) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3 += np.tensordot(grad_np_2, grad_np_2, axes=([0, 1], [0, 1])) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + precond_grad = np.tensordot(grad_np_2, mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np -= precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters( + ('SVDWithVar', False, False), + ('SVDWithResourceVar', False, True), + ('IterRootWithVar', True, False), + ('IterRootWithResourceVar', True, True), + ) + def testBasicTensor(self, use_iterative_root, use_resource_var): + self._testBasicTensor(use_iterative_root, use_resource_var) + + @parameterized.named_parameters(('Var', False), ('ResourceVar', True)) + def testLargeVector(self, use_resource_var): + """This is just the diagonal Adagrad update.""" + + size = 2000 + init_var_np = np.zeros(size) + grad_np = np.random.rand(size) + grad_np_2 = np.random.rand(size) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = constant_op.constant(grad_np, dtype=dtypes.float32) + grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32) + + opt = shampoo.ShampooOptimizer(global_step) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * gg^{-0.5} * grad + # lr = 1 + mat_g = grad_np * grad_np + 0.1 + new_val_np = init_var_np - np.power(mat_g, -0.5) * grad_np + + self.assertAllCloseAccordingToType(new_val_np, new_val) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g += grad_np_2 * grad_np_2 + new_val_np -= np.power(mat_g, -0.5) * grad_np_2 + + self.assertAllCloseAccordingToType(new_val_np, new_val) + + @parameterized.named_parameters(('Var', False), ('ResourceVar', True)) + def testLargeMatrix(self, use_resource_var): + """Gradient is a matrix, one of whose dimensions is large. + + We do diagonal updates for large dimensions. + + Args: + use_resource_var: use resource var as variables. + """ + + size = [2000, 3] + init_var_np = np.zeros(size) + grad_np = np.random.rand(size[0], size[1]) + grad_np_2 = np.random.rand(size[0], size[1]) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = constant_op.constant(grad_np, dtype=dtypes.float32) + grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32) + + opt = shampoo.ShampooOptimizer(global_step) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * mat_left * grad * mat_right + # where the mat_left * grad is just element-wise product, + # with broadcasting + # lr = 1 + + mat_g1 = np.sum(grad_np * grad_np, axis=1, keepdims=True) + mat_left = np.power(mat_g1 + 0.1, -0.25) + mat_g2 = np.dot(grad_np.transpose(), grad_np) + mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25) + new_val_np = init_var_np - np.dot(grad_np * mat_left, mat_right) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g1 += np.sum(grad_np_2 * grad_np_2, axis=1, keepdims=True) + mat_left = np.power(mat_g1 + 0.1, -0.25) + mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2) + mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25) + new_val_np -= np.dot(grad_np_2 * mat_left, mat_right) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters(('Var', False)) + def testSparseUpdateLarge(self, use_resource_var): + """Check update when gradient is of type IndexSlices. + + We do diagonal updates for the first dimension, unless it is very small. + + Args: + use_resource_var: use resource var as variables. + """ + size = [2000, 3] + sample_size_1 = 100 + init_var_np = np.zeros(size) + grad_indices = np.sort(np.random.choice(np.arange(size[0]), sample_size_1, + replace=False)) + grad_np = np.random.rand(sample_size_1, size[1]) + + sample_size_2 = 7 + grad_indices_2 = np.sort(np.random.choice(np.arange(size[0]), sample_size_2, + replace=False)) + grad_np_2 = np.random.rand(sample_size_2, size[1]) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = ops.IndexedSlices( + constant_op.constant(grad_np, dtype=dtypes.float32), + constant_op.constant(grad_indices), + constant_op.constant(size)) + grad_2 = ops.IndexedSlices( + constant_op.constant(grad_np_2, dtype=dtypes.float32), + constant_op.constant(grad_indices_2), + constant_op.constant(size)) + + opt = shampoo.ShampooOptimizer(global_step) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * mat_left * grad * mat_right + # where the mat_left * grad is just element-wise product, + # with broadcasting + # lr = 1 + # In this case the update lr * mat_left * grad * mat_right is + # of size 10 x 2. + # So the correct indices of var need to be updated. + + mat_g1 = np.sum(grad_np * grad_np, axis=1, keepdims=True) + mat_g1_acc = np.zeros((size[0], 1)) + mat_g1_acc[grad_indices] += mat_g1 + mat_left = np.power(mat_g1 + 0.1, -0.25) + mat_g2 = np.dot(grad_np.transpose(), grad_np) + mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25) + new_val_np = init_var_np + new_val_np[grad_indices, :] -= np.dot(grad_np * mat_left, mat_right) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g1 = np.sum(grad_np_2 * grad_np_2, axis=1, keepdims=True) + mat_g1_acc[grad_indices_2] += mat_g1 + mat_left = np.power(mat_g1_acc[grad_indices_2] + 0.1, -0.25) + mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2) + mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25) + new_val_np[grad_indices_2, :] -= np.dot(grad_np_2 * mat_left, mat_right) + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + def _testSparseUpdateSmall(self, use_iterative_root, use_resource_var): + """Gradient is of type IndexSlices, but the first dimension is small. + + We create dense gradient and do the full update with SVD etc. + + Args: + use_iterative_root: use iterative power method or SVD to find nth roots. + use_resource_var: use resource var as variables. + """ + + size = [100, 3, 5] + sample_size = 10 + init_var_np = np.zeros(size) + grad_indices = np.sort(np.random.choice(np.arange(size[0]), sample_size, + replace=False)) + grad_np = np.random.rand(sample_size, size[1], size[2]) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = ops.IndexedSlices( + constant_op.constant(grad_np, dtype=dtypes.float32), + constant_op.constant(grad_indices), + constant_op.constant(size)) + + opt = shampoo.ShampooOptimizer(global_step, + use_iterative_root=use_iterative_root) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * Prod_i mat_g_i^{-0.125} grad + # lr = 1 + grad_dense = np.zeros_like(init_var_np) + grad_dense[grad_indices] = grad_np + + mat_g1 = np.tensordot(grad_dense, grad_dense, axes=([1, 2], [1, 2])) + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2 = np.tensordot(grad_dense, grad_dense, axes=([0, 2], [0, 2])) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3 = np.tensordot(grad_dense, grad_dense, axes=([0, 1], [0, 1])) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + precond_grad = np.tensordot(grad_dense, mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np = init_var_np - precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters( + ('SVDWithVar', False, False), + ('SVDWithResourceVar', False, True), + ('IterRootWithVar', True, False), + ('IterRootWithResourceVar', True, True), + ) + def testSparseUpdateSmall(self, use_iterative_root, use_resource_var): + self._testSparseUpdateSmall(use_iterative_root, use_resource_var) + + def _testBasicTensorWithMomentum(self, use_iterative_root, use_resource_var): + """Check update with momentum when gradient is a tensor. + + Args: + use_iterative_root: use iterative power method or SVD to find nth roots. + use_resource_var: use resource var as variables. + """ + size = [10, 5, 7] + init_var_np = np.zeros(size) + grad_np = np.random.rand(size[0], size[1], size[2]) + grad_np_2 = np.random.rand(size[0], size[1], size[2]) + gbar_decay = 0.9 + gbar_weight = 0.1 + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = constant_op.constant(grad_np, dtype=dtypes.float32) + grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32) + + opt = shampoo.ShampooOptimizer(global_step, gbar_decay=gbar_decay, + gbar_weight=gbar_weight, + use_iterative_root=use_iterative_root) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + update_2 = opt.apply_gradients(zip([grad_2], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + # Run a step of Shampoo + update.run() + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad + # lr = 1 + mat_g1 = np.tensordot(grad_np, grad_np, axes=([1, 2], [1, 2])) + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2 = np.tensordot(grad_np, grad_np, axes=([0, 2], [0, 2])) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3 = np.tensordot(grad_np, grad_np, axes=([0, 1], [0, 1])) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + gbar_np = gbar_weight * grad_np + precond_grad = np.tensordot(gbar_np, mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np = init_var_np - precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + # Run another step of Shampoo + update_2.run() + new_val = sess.run(var) + + mat_g1 += np.tensordot(grad_np_2, grad_np_2, axes=([1, 2], [1, 2])) + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2 += np.tensordot(grad_np_2, grad_np_2, axes=([0, 2], [0, 2])) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3 += np.tensordot(grad_np_2, grad_np_2, axes=([0, 1], [0, 1])) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + gbar_np_2 = gbar_decay * gbar_np + gbar_weight * grad_np_2 + precond_grad = np.tensordot(gbar_np_2, mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np -= precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters( + ('SVDWithVar', False, False), + ('SVDWithResourceVar', False, True), + ('IterRootWithVar', True, False), + ('IterRootWithResourceVar', True, True), + ) + def testBasicTensorWithMomentum(self, use_iterative_root, use_resource_var): + self._testBasicTensorWithMomentum(use_iterative_root, use_resource_var) + + def _testDelayedSVD(self, use_iterative_root, use_resource_var): + """Performing the SVD every nth step. + + Args: + use_iterative_root: use iterative power method or SVD to find nth roots. + use_resource_var: use resource var as variables. + """ + size = [10, 5, 7] + init_var_np = np.zeros(size).astype(np.float32) + iterations = 20 + svd_interval = 5 + grad_np = np.random.rand( + iterations, size[0], size[1], size[2]).astype(np.float32) + mat_g1_a = np.eye(size[0]) + mat_g1 = np.zeros_like(mat_g1_a) + mat_g2_a = np.eye(size[1]) + mat_g2 = np.zeros_like(mat_g2_a) + mat_g3_a = np.eye(size[2]) + mat_g3 = np.zeros_like(mat_g3_a) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = array_ops.placeholder(dtypes.float32, shape=size) + + opt = shampoo.ShampooOptimizer(global_step, svd_interval=svd_interval, + use_iterative_root=use_iterative_root) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + new_val_np = init_var_np + + # Run n steps of Shampoo + for i in range(iterations): + _ = sess.run(update, feed_dict={grad: grad_np[i]}) + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad + # lr = 1 + mat_g1 += np.tensordot(grad_np[i], grad_np[i], axes=([1, 2], [1, 2])) + mat_g2 += np.tensordot(grad_np[i], grad_np[i], axes=([0, 2], [0, 2])) + mat_g3 += np.tensordot(grad_np[i], grad_np[i], axes=([0, 1], [0, 1])) + if (i + 1) % svd_interval == 0: + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + precond_grad = np.tensordot(grad_np[i], mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np -= precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters( + ('SVDWithVar', False, False), + ('SVDWithResourceVar', False, True), + ('IterRootWithVar', True, False), + ('IterRootWithResourceVar', True, True), + ) + def testDelayedSVD(self, use_iterative_root, use_resource_var): + self._testDelayedSVD(use_iterative_root, use_resource_var) + + def _testDelayedPrecondUpdate(self, use_iterative_root, use_resource_var): + """Update the squared sum every nth step, drop the other steps. + + Args: + use_iterative_root: use iterative power method or SVD to find nth roots. + use_resource_var: use resource var as variables. + """ + size = [10, 5, 7] + init_var_np = np.zeros(size).astype(np.float32) + iterations = 100 + grad_np = np.random.rand( + iterations, size[0], size[1], size[2]).astype(np.float32) + svd_interval = 20 + precond_update_interval = 5 + mat_g1_a = np.eye(size[0]) + mat_g1 = np.zeros_like(mat_g1_a) + mat_g2_a = np.eye(size[1]) + mat_g2 = np.zeros_like(mat_g2_a) + mat_g3_a = np.eye(size[2]) + mat_g3 = np.zeros_like(mat_g3_a) + + with self.test_session() as sess: + global_step = variables.Variable( + 0, dtype=dtypes.int64, use_resource=use_resource_var) + var = variables.Variable( + init_var_np, dtype=dtypes.float32, use_resource=use_resource_var) + grad = array_ops.placeholder(dtypes.float32, shape=size) + + opt = shampoo.ShampooOptimizer( + global_step, svd_interval=svd_interval, + precond_update_interval=precond_update_interval, + use_iterative_root=use_iterative_root) + update = opt.apply_gradients(zip([grad], [var]), + global_step=global_step) + variables.global_variables_initializer().run() + + init_val = sess.run(var) + self.assertAllCloseAccordingToType(init_var_np, init_val) + new_val_np = init_var_np + + # Run n steps of Shampoo + for i in range(iterations): + _ = sess.run(update, feed_dict={grad: grad_np[i]}) + new_val = sess.run(var) + + # let up compute this in numpy + # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad + # lr = 1 + if (i + 1) % precond_update_interval == 0: + mat_g1 += (np.tensordot(grad_np[i], grad_np[i], axes=([1, 2], [1, 2])) + * precond_update_interval) + mat_g2 += (np.tensordot(grad_np[i], grad_np[i], axes=([0, 2], [0, 2])) + * precond_update_interval) + mat_g3 += (np.tensordot(grad_np[i], grad_np[i], axes=([0, 1], [0, 1])) + * precond_update_interval) + + if (i + 1) % svd_interval == 0: + mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5/3.0) + mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5/3.0) + mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5/3.0) + + precond_grad = np.tensordot(grad_np[i], mat_g1_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0])) + precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0])) + new_val_np -= precond_grad + + self.assertAllCloseAccordingToType(new_val_np, new_val, + atol=TOLERANCE, rtol=TOLERANCE) + + @parameterized.named_parameters( + ('SVDWithVar', False, False), + ('SVDWithResourceVar', False, True), + ('IterRootWithVar', True, False), + ('IterRootWithResourceVar', True, True), + ) + def testDelayedPrecondUpdate(self, use_iterative_root, use_resource_var): + self._testDelayedPrecondUpdate(use_iterative_root, use_resource_var) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py index 06ab58188a2fffa0e3a810d451875ca951a077b9..28a531dfecf275c48fea54310b93b5266a79899a 100644 --- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py +++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py @@ -41,6 +41,7 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import template from tensorflow.python.ops import variable_scope +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import saver as core_saver from tensorflow.python.training import training_util from tensorflow.python.training.checkpointable import tracking @@ -278,7 +279,8 @@ class CheckpointingTests(test.TestCase): root = util.Checkpoint( optimizer=optimizer, model=model, optimizer_step=training_util.get_or_create_global_step()) - root.restore(core_saver.latest_checkpoint(checkpoint_directory)) + root.restore(checkpoint_management.latest_checkpoint( + checkpoint_directory)) for _ in range(num_training_steps): # TODO(allenl): Use a Dataset and serialize/checkpoint it. input_value = constant_op.constant([[3.]]) @@ -306,7 +308,8 @@ class CheckpointingTests(test.TestCase): train_op = optimizer.minimize( model(input_value), global_step=root.global_step) - checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory) + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) with self.test_session(graph=ops.get_default_graph()) as session: status = root.restore(save_path=checkpoint_path) status.initialize_or_restore(session=session) @@ -339,7 +342,8 @@ class CheckpointingTests(test.TestCase): root = util.Checkpoint( optimizer=optimizer, model=model, global_step=training_util.get_or_create_global_step()) - checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory) + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) status = root.restore(save_path=checkpoint_path) input_value = constant_op.constant([[3.]]) train_fn = functools.partial( @@ -372,7 +376,8 @@ class CheckpointingTests(test.TestCase): root = util.Checkpoint( optimizer=optimizer, model=model, global_step=training_util.get_or_create_global_step()) - checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory) + checkpoint_path = checkpoint_management.latest_checkpoint( + checkpoint_directory) status = root.restore(save_path=checkpoint_path) def train_fn(): @function.defun diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD index 36e21af618f5af744ce793509813eaf36e1b8479..72ea777ca7036bad91b15d8d2163fdee842b1e32 100644 --- a/tensorflow/contrib/predictor/BUILD +++ b/tensorflow/contrib/predictor/BUILD @@ -60,7 +60,7 @@ py_library( ":base_predictor", "//tensorflow/python:framework_ops", "//tensorflow/python:training", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/saved_model:signature_constants", ], ) @@ -90,9 +90,7 @@ py_library( "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python/estimator", - "//tensorflow/python/estimator:export", - "//tensorflow/python/estimator:export_output", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/saved_model:signature_constants", ], ) diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py index af3b2ad1b531b835f484a155efcc57bbe634f2df..c2166594e598857065a7fd109ec599a3b36e2d2c 100644 --- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py +++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py @@ -22,8 +22,8 @@ from __future__ import print_function from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils from tensorflow.contrib.predictor import predictor from tensorflow.python.framework import ops +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import monitored_session -from tensorflow.python.training import saver class ContribEstimatorPredictor(predictor.Predictor): @@ -57,7 +57,8 @@ class ContribEstimatorPredictor(predictor.Predictor): # pylint: disable=protected-access model_fn_ops = estimator._get_predict_ops(input_fn_ops.features) # pylint: enable=protected-access - checkpoint_path = saver.latest_checkpoint(estimator.model_dir) + checkpoint_path = checkpoint_management.latest_checkpoint( + estimator.model_dir) self._session = monitored_session.MonitoredSession( session_creator=monitored_session.ChiefSessionCreator( config=config, diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py index f275bc15adfa0a51a48964dff8edddbd45500e45..7886744b3ce7fc438bc73cb81bccfd0ddeea873e 100644 --- a/tensorflow/contrib/predictor/predictor_factories.py +++ b/tensorflow/contrib/predictor/predictor_factories.py @@ -108,6 +108,8 @@ def from_estimator(estimator, def from_saved_model(export_dir, signature_def_key=None, signature_def=None, + input_names=None, + output_names=None, tags=None, graph=None, config=None): @@ -121,6 +123,12 @@ def from_saved_model(export_dir, signature_def: A `SignatureDef` proto specifying the inputs and outputs for prediction. Only one of `signature_def_key` and `signature_def` should be specified. + input_names: A dictionary mapping strings to `Tensor`s in the `SavedModel` + that represent the input. The keys can be any string of the user's + choosing. + output_names: A dictionary mapping strings to `Tensor`s in the + `SavedModel` that represent the output. The keys can be any string of + the user's choosing. tags: Optional. Tags that will be used to retrieve the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`. graph: Optional. The Tensorflow `graph` in which prediction should be @@ -138,6 +146,8 @@ def from_saved_model(export_dir, export_dir, signature_def_key=signature_def_key, signature_def=signature_def, + input_names=input_names, + output_names=output_names, tags=tags, graph=graph, config=config) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index e3c48998305e9d9b6c185fd4c0f324fa0449c691..d9f179bee48de587976872dabb470cfd5c69114c 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -120,6 +120,7 @@ def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay): scaled_weight_tensor = math_ops.multiply( weights, multiplier_tensor, name='mul_fold') + new_layer_tensor = _CloneWithNewOperands( match.layer_op, match.input_tensor, scaled_weight_tensor, match.batch_to_space_op) @@ -368,20 +369,20 @@ def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay, lambda: bn_decay_zero, lambda: match.bn_decay_mean_tensor, name='freeze_moving_mean') + graph_editor.reroute_ts( [bn_decay_mean_out], [match.bn_decay_mean_tensor], can_modify=bn_decay_mean_consumers) - if fused_batch_norm is False: - bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers()) - bn_decay_var_out = utils.smart_cond( - use_mv_avg, - lambda: bn_decay_zero, - lambda: match.bn_decay_var_tensor, - name='freeze_moving_var') - graph_editor.reroute_ts( - [bn_decay_var_out], [match.bn_decay_var_tensor], - can_modify=bn_decay_var_consumers) + bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers()) + bn_decay_var_out = utils.smart_cond( + use_mv_avg, + lambda: bn_decay_zero, + lambda: match.bn_decay_var_tensor, + name='freeze_moving_var') + graph_editor.reroute_ts( + [bn_decay_var_out], [match.bn_decay_var_tensor], + can_modify=bn_decay_var_consumers) correction_recip = utils.smart_cond( use_mv_avg, diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index 7c907ffd92c1ae0c762e41cc429b0e6ce053f6b9..3f8063cc022726cb745d42aba3c834c71e876e70 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -128,6 +128,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names) + if freeze_batch_norm_delay is not None: + self._AssertMovingAveragesAreFrozen(g, scope) + for op in g.get_operations(): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) @@ -216,6 +219,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): ]) output_op_names = [scope + '/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names) + if freeze_batch_norm_delay is not None: + self._AssertMovingAveragesAreFrozen(g, scope) for op in g.get_operations(): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) @@ -284,6 +289,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names) + if freeze_batch_norm_delay is not None: + self._AssertMovingAveragesAreFrozen(g, scope) for op in g.get_operations(): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) @@ -351,6 +358,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names) + if freeze_batch_norm_delay is not None: + self._AssertMovingAveragesAreFrozen(g, scope) for op in g.get_operations(): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) @@ -431,6 +440,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names) + if freeze_batch_norm_delay is not None: + self._AssertMovingAveragesAreFrozen(g, scope) for op in g.get_operations(): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) @@ -515,6 +526,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): ]) output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name] self._AssertOutputGoesToOps(folded_add, g, output_op_names) + if freeze_batch_norm_delay is not None: + self._AssertMovingAveragesAreFrozen(g, scope) for op in g.get_operations(): self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) @@ -644,6 +657,22 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): out_op = graph.get_operation_by_name(out_op_name) self.assertIn(op.outputs[0].name, [str(t.name) for t in out_op.inputs]) + def _AssertMovingAveragesAreFrozen(self, graph, scope): + """Asserts to check if moving mean and variance are frozen. + + Args: + graph: Graph where the operations are located. + scope: Scope of batch norm op + """ + moving_average_mult = graph.get_operation_by_name( + scope + '/BatchNorm/AssignMovingAvg/mul') + self.assertTrue( + moving_average_mult.inputs[1].name.find('freeze_moving_mean/Merge') > 0) + moving_var_mult = graph.get_operation_by_name( + scope + '/BatchNorm/AssignMovingAvg_1/mul') + self.assertTrue( + moving_var_mult.inputs[1].name.find('freeze_moving_var/Merge') > 0) + def _CopyGraph(self, graph): """Return a copy of graph.""" meta_graph = saver_lib.export_meta_graph( diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py index c2a8def48012c808da18587c8ff462fa33a363c0..a45840009b758881c14fb64b2d39af6cd4ec4bc4 100644 --- a/tensorflow/contrib/quantize/python/quant_ops_test.py +++ b/tensorflow/contrib/quantize/python/quant_ops_test.py @@ -75,7 +75,7 @@ class QuantOpsTest(googletest.TestCase): self.assertGreater(max_value, 0.0) self.assertLess(max_value, 1.0) - def testVariablesNotParitioned_LastValue(self): + def testVariablesNotPartitioned_LastValue(self): # Variables added should not use a default partiioner since they are # scalar. There would be a tensorflow error thrown if the partitioner was # respected by the rewrite. @@ -90,7 +90,7 @@ class QuantOpsTest(googletest.TestCase): is_training=True, vars_collection=_MIN_MAX_VARS) - def testVariablesNotParitioned_MovingAvg(self): + def testVariablesNotPartitioned_MovingAvg(self): # Variables added should not use a default partiioner since they are # scalar. There would be a tensorflow error thrown if the partitioner was # respected by the rewrite. diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index 4fc315d901a86ac235513aad6eb34d7f90f61801..cb66fd1f76bcdb0a8f77fc7c476511576368ab4e 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -198,7 +198,7 @@ def _FindLayersToQuantize(graph): | [post_conv_correction] | - biasadd|folded_bias + [biasadd|folded_bias] | [bypass] | @@ -261,6 +261,16 @@ def _FindLayersToQuantize(graph): layer_output_pattern = graph_matcher.OneofPattern( [batch_to_space_pattern, layer_pattern]) + + # For separable convolutions, we are looking for a conv, followed by a conv + # with no activations between the two. + sep_conv_pattern = graph_matcher.OpTypePattern( + '|'.join(_QUANTIZABLE_TYPES), + inputs=[ + graph_matcher.OneofPattern([layer_output_pattern]), + graph_matcher.OpTypePattern('*') + ], + ordered_inputs=False) folded_bias_mul_pattern = graph_matcher.OpTypePattern( 'Mul', inputs=[graph_matcher.OpTypePattern('*'), layer_output_pattern], @@ -310,6 +320,7 @@ def _FindLayersToQuantize(graph): folded_bias_add_pattern, batch_norm_identity, bypass_pattern, + layer_pattern, ]) ]) @@ -393,6 +404,17 @@ def _FindLayersToQuantize(graph): layer_matches.append( _LayerMatch(layer_op, weight_tensor, activation_op, None, None, None)) + # Look for separable convolutions here + sep_conv_matcher = graph_matcher.GraphMatcher(sep_conv_pattern) + for match_result in sep_conv_matcher.match_graph(graph): + layer_op = match_result.get_op(layer_pattern) + weight_tensor = match_result.get_tensor(weight_identity_pattern) + activation_op = match_result.get_op(layer_pattern) + if layer_op not in matched_layer_set: + matched_layer_set.add(layer_op) + layer_matches.append( + _LayerMatch(layer_op, weight_tensor, activation_op, None, None, None)) + return layer_matches diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py index 92ca4a1b0c3126ebccf2b525f01f4d6455c4d527..06ebcdfee1617af0c13cd6ed09a2ec5190c5a718 100644 --- a/tensorflow/contrib/quantize/python/quantize_test.py +++ b/tensorflow/contrib/quantize/python/quantize_test.py @@ -122,12 +122,67 @@ class QuantizeTest(test_util.TensorFlowTestCase): array_ops.identity(node, name='control_dependency') quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) + # Check if output of bias add is quantized + quantization_node_name = 'FakeQuantWithMinMaxVars' + conv_quant = graph.get_operation_by_name('test/test/conv_quant/' + + quantization_node_name) + self.assertEqual(conv_quant.type, quantization_node_name) + + for op in graph.get_operations(): + if op.type == quantization_node_name: + quant_op = graph.get_operation_by_name(op.name) + # Scan through all FakeQuant operations, ensuring that the activation + # identity op isn't in the consumers of the operation. + consumers = [] + for output in quant_op.outputs: + consumers.extend(output.consumers()) + + self.assertNotIn('test/relu6', [c.name for c in consumers]) + + def testInsertQuantOpInSeparableConv2d(self): + self._RunTestOverParameters(self._TestInsertQuantOpInSeparableConv2d) + + def _TestInsertQuantOpInSeparableConv2d(self, is_training): + graph = ops.Graph() + with graph.as_default(): + batch_size, height, width, depth = 5, 128, 128, 3 + input1 = array_ops.zeros((batch_size, height, width, depth)) + input2 = array_ops.zeros((batch_size, height / 2, width / 2, depth)) + conv = separable_conv2d( + input1, + 3, [5, 5], + stride=2, + depth_multiplier=1.0, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=None, + scope='test/test') + node = math_ops.add(conv, input2, name='test/add') + node = nn_ops.relu6(node, name='test/relu6') + update_barrier = control_flow_ops.no_op(name='update_barrier') + with ops.control_dependencies([update_barrier]): + array_ops.identity(node, name='control_dependency') + quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) + # Check if output of bias add is quantized quantization_node_name = 'FakeQuantWithMinMaxVars' conv_quant = graph.get_operation_by_name('test/test/conv_quant/' + quantization_node_name) self.assertEqual(conv_quant.type, quantization_node_name) + # Check if weights for both convs inside seperable conv are quantized + pointwise_weight_quant = graph.get_operation_by_name( + 'test/test/weights_quant/' + quantization_node_name) + self.assertEqual(pointwise_weight_quant.type, quantization_node_name) + depthwise_weight_quant = graph.get_operation_by_name( + 'test/test/separable_conv2d/weights_quant/' + quantization_node_name) + self.assertEqual(depthwise_weight_quant.type, quantization_node_name) + + # Check if activations after first depthwise conv are quantized. + depthwise_act_quant = graph.get_operation_by_name( + 'test/test/separable_conv2d/act_quant/' + quantization_node_name) + self.assertEqual(depthwise_act_quant.type, quantization_node_name) + for op in graph.get_operations(): if op.type == quantization_node_name: quant_op = graph.get_operation_by_name(op.name) @@ -139,6 +194,33 @@ class QuantizeTest(test_util.TensorFlowTestCase): self.assertNotIn('test/relu6', [c.name for c in consumers]) + def testLayerActivationQuantized(self): + self._RunTestOverParameters(self._TestLayerActivationQuantized) + + def _TestLayerActivationQuantized(self, is_training): + graph = ops.Graph() + with graph.as_default(): + batch_size, height, width, depth = 5, 128, 128, 3 + input1 = array_ops.zeros((batch_size, height, width, depth)) + _ = conv2d( + input1, + 32, [5, 5], + stride=2, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=nn_ops.relu6, + biases_initializer=None, + scope='test') + # Ensure that both weights and output of activations are quantized + # when we have a conv->relu6 with no bias add + quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) + activation_op = graph.get_operation_by_name('test/Relu6') + conv_op = graph.get_operation_by_name('test/Conv2D') + self.assertTrue('test/weights_quant/FakeQuantWithMinMaxVars:0' in + [tensor_in.name for tensor_in in conv_op.inputs]) + self.assertTrue('FakeQuantWithMinMaxVars' in + [op.type for op in activation_op.outputs[0].consumers()]) + def testFinalLayerQuantized(self): self._RunTestOverParameters(self._TestFinalLayerQuantized) diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py index 0f19ac7dbe0cee2eb6c780ec5ea6266bc847abd7..f23194a6f2e64e0619049bac51891d6d6099831f 100644 --- a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py +++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py @@ -61,10 +61,17 @@ class FunctionalRnnTest(test_util.TensorFlowTestCase): func, args = self._CELLDEFS[celldef_name] return func(*args) - def _CreateInputs(self): - inputs = np.random.random([FunctionalRnnTest._BATCH_SIZE, - FunctionalRnnTest._TOTAL_TIME, - FunctionalRnnTest._INPUT_SIZE]) + def _CreateInputs(self, time_major=False): + if time_major: + inputs = np.random.random([ + FunctionalRnnTest._TOTAL_TIME, FunctionalRnnTest._BATCH_SIZE, + FunctionalRnnTest._INPUT_SIZE + ]) + else: + inputs = np.random.random([ + FunctionalRnnTest._BATCH_SIZE, FunctionalRnnTest._TOTAL_TIME, + FunctionalRnnTest._INPUT_SIZE + ]) # Always leave one time slot empty, to check max_length behavior. sequence_length = np.random.randint( 0, high=FunctionalRnnTest._TOTAL_TIME - 1, @@ -72,15 +79,51 @@ class FunctionalRnnTest(test_util.TensorFlowTestCase): dtype=np.int) return (inputs, sequence_length) - def _CreateRnnGraph(self, create_rnn_computation_func, cell, tf_inputs, - tf_sequence_length, initial_state=None, - time_major=None, scope=None): - tf_result = create_rnn_computation_func(cell=cell, inputs=tf_inputs, - sequence_length=tf_sequence_length, - initial_state=initial_state, - dtype=dtypes.float32, - time_major=time_major, - scope=scope) + def _CreateSymmetricInputs(self): + # total time = batch size + inputs = np.zeros( + (FunctionalRnnTest._BATCH_SIZE, FunctionalRnnTest._BATCH_SIZE, + FunctionalRnnTest._INPUT_SIZE)) + for i in range(FunctionalRnnTest._BATCH_SIZE): + for j in range(i, FunctionalRnnTest._BATCH_SIZE): + inputs[i][j] = np.random.random([FunctionalRnnTest._INPUT_SIZE]) + inputs[j][i] = inputs[i][j] + + # Always leave one time slot empty, to check max_length behavior. + sequence_length = np.random.randint( + 0, + high=FunctionalRnnTest._BATCH_SIZE - 1, + size=FunctionalRnnTest._BATCH_SIZE, + dtype=np.int) + return (inputs, sequence_length) + + def _CreateRnnGraph(self, + create_rnn_computation_func, + cell, + tf_inputs, + tf_sequence_length, + is_bidirectional, + initial_state=None, + time_major=None, + scope=None): + if is_bidirectional: + tf_result = create_rnn_computation_func( + cell_fw=cell, + cell_bw=cell, + inputs=tf_inputs, + sequence_length=tf_sequence_length, + dtype=dtypes.float32, + time_major=time_major, + scope=scope) + else: + tf_result = create_rnn_computation_func( + cell=cell, + inputs=tf_inputs, + sequence_length=tf_sequence_length, + initial_state=initial_state, + dtype=dtypes.float32, + time_major=time_major, + scope=scope) grad = gradients_impl.gradients(tf_result, variables.trainable_variables()) return {'inference': tf_result, 'grad': grad} @@ -102,15 +145,26 @@ class FunctionalRnnTest(test_util.TensorFlowTestCase): variable_cache[n] = v def _RunRnn(self, numpy_inputs, numpy_slen, cell_name, variable_cache, - is_dynamic): + is_dynamic, time_major=None, is_bidirectional=False): with ops.Graph().as_default() as graph: tf_inputs = array_ops.placeholder( dtypes.float32, shape=numpy_inputs.shape) tf_slen = array_ops.placeholder(dtypes.int32) feeds = {tf_inputs: numpy_inputs, tf_slen: numpy_slen} cell = self._CreateCell(cell_name) - fn = rnn_lib.dynamic_rnn if is_dynamic else functional_rnn.functional_rnn - fetches = self._CreateRnnGraph(fn, cell, tf_inputs, tf_slen) + if is_dynamic: + if is_bidirectional: + fn = rnn_lib.bidirectional_dynamic_rnn + else: + fn = rnn_lib.dynamic_rnn + else: + if is_bidirectional: + fn = functional_rnn.bidirectional_functional_rnn + else: + fn = functional_rnn.functional_rnn + + fetches = self._CreateRnnGraph( + fn, cell, tf_inputs, tf_slen, is_bidirectional, time_major=time_major) with self.test_session(graph=graph) as sess: sess.run(variables.global_variables_initializer()) # Note that cell.trainable_variables it not always set. @@ -158,6 +212,78 @@ class FunctionalRnnTest(test_util.TensorFlowTestCase): self.assertAllClose(dyn_rnn['inference'], func_rnn['inference']) self.assertAllClose(dyn_rnn['grad'], func_rnn['grad']) + def testLstmWithTimeMajorInputs(self): + """Checks an LSTM against the reference implementation, with time_major.""" + time_major = True + np_inputs, np_slen = self._CreateInputs(time_major=True) + var_cache = {} + args = [np_inputs, np_slen, 'lstm', var_cache] + _, func_rnn = self._RunRnn(*(args + [False]), time_major=time_major) + _, dyn_rnn = self._RunRnn(*(args + [True]), time_major=time_major) + self.assertAllClose(dyn_rnn['inference'], func_rnn['inference']) + self.assertAllClose(dyn_rnn['grad'], func_rnn['grad']) + + def testBidirectionalLstmWithTimeMajorInputs(self): + """Checks a bi-directional LSTM with time-major inputs.""" + time_major = True + np_inputs, np_slen = self._CreateInputs(time_major) + var_cache = {} + args = [np_inputs, np_slen, 'lstm', var_cache] + _, func_rnn = self._RunRnn( + *(args + [False]), time_major=time_major, is_bidirectional=True) + _, dyn_rnn = self._RunRnn( + *(args + [True]), time_major=time_major, is_bidirectional=True) + self.assertAllClose(dyn_rnn['inference'], func_rnn['inference']) + # TODO(b/112170761): comment out this line after the bug is fixed. + # self.assertAllClose(dyn_rnn['grad'], func_rnn['grad']) + + def testBidirectionalLstm(self): + """Checks time-major and batch-major rnn produce consistent results.""" + time_major_inputs, np_slen = self._CreateInputs(True) + batch_major_inputs = np.transpose(time_major_inputs, [1, 0, 2]) + var_cache = {} + args = [np_slen, 'lstm', var_cache, False] + _, time_major_rnn = self._RunRnn( + *([time_major_inputs] + args), time_major=True, is_bidirectional=True) + _, batch_major_rnn = self._RunRnn( + *([batch_major_inputs]+ args), time_major=False, is_bidirectional=True) + # Convert the batch-major outputs to be time-major before the comparasion. + outputs, state = batch_major_rnn['inference'] + outputs = [np.transpose(x, [1, 0, 2]) for x in outputs] + batch_major_rnn['inference'] = [outputs, state] + self.assertAllClose(time_major_rnn['inference'], + batch_major_rnn['inference']) + self.assertAllClose(time_major_rnn['grad'], batch_major_rnn['grad']) + + def testBidirectionalLstmWithSymmetricInputs(self): + """Checks a bi-directional LSTM with symmetric inputs. + + time-major and batch-major rnn produce the same result with symmetric + inputs. + """ + np_inputs, np_slen = self._CreateSymmetricInputs() + var_cache = {} + args = [np_inputs, np_slen, 'lstm', var_cache] + _, time_major_func_rnn = self._RunRnn( + *(args + [False]), time_major=True, is_bidirectional=True) + _, batch_major_func_rnn = self._RunRnn( + *(args + [False]), time_major=False, is_bidirectional=True) + _, time_major_dyn_rnn = self._RunRnn( + *(args + [True]), time_major=True, is_bidirectional=True) + _, batch_major_dyn_rnn = self._RunRnn( + *(args + [True]), time_major=False, is_bidirectional=True) + self.assertAllClose(time_major_func_rnn['inference'], + batch_major_func_rnn['inference']) + self.assertAllClose(time_major_func_rnn['grad'], + batch_major_func_rnn['grad']) + self.assertAllClose(time_major_dyn_rnn['inference'], + batch_major_dyn_rnn['inference']) + self.assertAllClose(time_major_dyn_rnn['grad'], batch_major_dyn_rnn['grad']) + self.assertAllClose(time_major_func_rnn['inference'], + batch_major_dyn_rnn['inference']) + self.assertAllClose(time_major_func_rnn['grad'], + batch_major_dyn_rnn['grad']) + if __name__ == '__main__': test_lib.main() diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py index a085474c1bf6117ba5663139c78d8f08f71392d3..67a8f59c3c03d01a5957a9eff8bd026e70770a45 100644 --- a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py +++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py @@ -206,7 +206,7 @@ def _PickFinalStateFromHistory(acc_state, sequence_length): lengths = array_ops.tile(array_ops.reshape(sequence_length, [-1, 1]), [1, max_time]) last_idx = math_ops.cast(math_ops.equal(output_time, lengths - 1), - dtype=dtypes.float32) + dtype=state_var.dtype) last_idx = array_ops.transpose(last_idx) last_idx_for_bcast = array_ops.expand_dims(last_idx, -1) sliced = math_ops.multiply(last_idx_for_bcast, state_var) @@ -284,8 +284,13 @@ def functional_rnn(cell, inputs, sequence_length=None, inputs=inputs, cell_fn=func_cell.cell_step, use_tpu=use_tpu) - return _PostProcessOutput(extended_acc_state, extended_final_state, - func_cell, inputs_flat[0].shape[0], sequence_length) + tf_output, tf_state = _PostProcessOutput( + extended_acc_state, extended_final_state, func_cell, + inputs_flat[0].shape[0], sequence_length) + + if time_major: + tf_output = array_ops.transpose(tf_output, [1, 0, 2]) + return tf_output, tf_state def bidirectional_functional_rnn( diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index 1816b469ee5bf338453a82d18663f97f6565dc0c..f74c95f96299cf132a9a1d8ab8b238a532e2695b 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -3276,7 +3276,7 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell): It does not allow cell clipping, a projection layer, and does not use peep-hole connections: it is the basic baseline. - For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell} + For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell` that follows. TODO(gonnet): Write a paper describing this and add a reference here. diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD index 1a1591d798f6f904e23987d9d7a60193c124c20e..18b56cd21942e28cb0dc3210df0bb04d55c1e16f 100644 --- a/tensorflow/contrib/seq2seq/BUILD +++ b/tensorflow/contrib/seq2seq/BUILD @@ -177,7 +177,7 @@ cuda_py_test( cuda_py_test( name = "beam_search_decoder_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/beam_search_decoder_test.py"], additional_deps = [ ":seq2seq_py", diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py index 1c9d179e3c55ad07fcf709f66028c91c20e8eea0..0ba32cd3bf8a374f5f55bdc6b2325b03443cd545 100644 --- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py +++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py @@ -382,8 +382,8 @@ class LuongAttention(_BaseAttentionMechanism): for values past the respective sequence lengths. scale: Python boolean. Whether to scale the energy term. probability_fn: (optional) A `callable`. Converts the score to - probabilities. The default is @{tf.nn.softmax}. Other options include - @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}. + probabilities. The default is `tf.nn.softmax`. Other options include + `tf.contrib.seq2seq.hardmax` and `tf.contrib.sparsemax.sparsemax`. Its signature should be: `probabilities = probability_fn(score)`. score_mask_value: (optional) The mask value for score before passing into `probability_fn`. The default is -inf. Only used if @@ -529,8 +529,8 @@ class BahdanauAttention(_BaseAttentionMechanism): for values past the respective sequence lengths. normalize: Python boolean. Whether to normalize the energy term. probability_fn: (optional) A `callable`. Converts the score to - probabilities. The default is @{tf.nn.softmax}. Other options include - @{tf.contrib.seq2seq.hardmax} and @{tf.contrib.sparsemax.sparsemax}. + probabilities. The default is `tf.nn.softmax`. Other options include + `tf.contrib.seq2seq.hardmax` and `tf.contrib.sparsemax.sparsemax`. Its signature should be: `probabilities = probability_fn(score)`. score_mask_value: (optional): The mask value for score before passing into `probability_fn`. The default is -inf. Only used if @@ -1091,7 +1091,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell): `AttentionWrapper`, then you must ensure that: - The encoder output has been tiled to `beam_width` via - @{tf.contrib.seq2seq.tile_batch} (NOT `tf.tile`). + `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`). - The `batch_size` argument passed to the `zero_state` method of this wrapper is equal to `true_batch_size * beam_width`. - The initial state created with `zero_state` above contains a diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py index f17dbb0fe3c13c3a43f043b82772949737dfb2de..74741a7bd6306181c248af50e9784f45dfc41c55 100644 --- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py +++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py @@ -234,7 +234,7 @@ class BeamSearchDecoder(decoder.Decoder): `AttentionWrapper`, then you must ensure that: - The encoder output has been tiled to `beam_width` via - @{tf.contrib.seq2seq.tile_batch} (NOT `tf.tile`). + `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`). - The `batch_size` argument passed to the `zero_state` method of this wrapper is equal to `true_batch_size * beam_width`. - The initial state created with `zero_state` above contains a diff --git a/tensorflow/contrib/signal/python/kernel_tests/test_util.py b/tensorflow/contrib/signal/python/kernel_tests/test_util.py index 7d6289532addfd4b4b867bf64d9113253bd1c76d..b4422a49887378187a2be46275d4dabf1fbd40a1 100644 --- a/tensorflow/contrib/signal/python/kernel_tests/test_util.py +++ b/tensorflow/contrib/signal/python/kernel_tests/test_util.py @@ -27,15 +27,15 @@ def grappler_optimize(graph, fetches=None, rewriter_config=None): """Tries to optimize the provided graph using grappler. Args: - graph: A @{tf.Graph} instance containing the graph to optimize. + graph: A `tf.Graph` instance containing the graph to optimize. fetches: An optional list of `Tensor`s to fetch (i.e. not optimize away). Grappler uses the 'train_op' collection to look for fetches, so if not provided this collection should be non-empty. - rewriter_config: An optional @{tf.RewriterConfig} to use when rewriting the + rewriter_config: An optional `tf.RewriterConfig` to use when rewriting the graph. Returns: - A @{tf.GraphDef} containing the rewritten graph. + A `tf.GraphDef` containing the rewritten graph. """ if rewriter_config is None: rewriter_config = rewriter_config_pb2.RewriterConfig() diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py index 062d84aea183ab61501a8b07521adb1a1a17c63c..ecc2fedb9f82151511bab3f3c0496bc4e290903f 100644 --- a/tensorflow/contrib/signal/python/ops/mel_ops.py +++ b/tensorflow/contrib/signal/python/ops/mel_ops.py @@ -108,7 +108,7 @@ def linear_to_mel_weight_matrix(num_mel_bins=20, # `M` has shape [frames, num_mel_bins] M = tf.matmul(S, A) - The matrix can be used with @{tf.tensordot} to convert an arbitrary rank + The matrix can be used with `tf.tensordot` to convert an arbitrary rank `Tensor` of linear-scale spectral bins into the mel scale. # S has shape [..., num_spectrogram_bins]. diff --git a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py index 653c030a04c2bbc7e3ee49b9c85a781fb49de8d0..4db8dc2ca090534f2cda66bd55c30dfa389b860a 100644 --- a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py +++ b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py @@ -90,22 +90,28 @@ def overlap_and_add(signal, frame_step, name=None): raise ValueError("frame_step must be an integer. Got %s" % frame_step.dtype) - # If frame_length and frame_step are known at graph construction time, check - # frame_step is less than or equal to frame_length. - frame_step_static = tensor_util.constant_value(frame_step) - if (frame_step_static is not None and signal.shape.ndims is not None and - signal.shape[-1].value is not None and - frame_step_static > signal.shape[-1].value): - raise ValueError( - "frame_step (%d) must be less than or equal to frame_length (%d)" % ( - frame_step_static, signal.shape[-1].value)) - signal_shape = array_ops.shape(signal) # All dimensions that are not part of the overlap-and-add. Can be empty for # rank 2 inputs. outer_dimensions = signal_shape[:-2] + # If frame_length and frame_step are known at graph construction time, check + # frame_step is less than or equal to frame_length. + frame_step_static = tensor_util.constant_value(frame_step) + if (frame_step_static is not None and signal.shape.ndims is not None and + signal.shape[-1].value is not None): + if frame_step_static > signal.shape[-1].value: + raise ValueError( + "frame_step (%d) must be less than or equal to " + "frame_length (%d)" % ( + frame_step_static, signal.shape[-1].value)) + # If frame_length is equal to frame_step, there's no overlap so just + # reshape the tensor. + if frame_step_static == signal.shape[-1].value: + return array_ops.reshape(signal, array_ops.concat( + [outer_dimensions, [-1]], 0)) + signal_rank = array_ops.rank(signal) frames = signal_shape[-2] frame_length = signal_shape[-1] diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py index 5cfd5ee82e2a0fce33311a8783d2d4ceb031544d..0feb3925eb8ec4eca7c7fd527510f45ceb83091b 100644 --- a/tensorflow/contrib/slim/python/slim/evaluation.py +++ b/tensorflow/contrib/slim/python/slim/evaluation.py @@ -22,7 +22,8 @@ modules using a variety of metrics and summarizing the results. ********************** In the simplest use case, we use a model to create the predictions, then specify -the metrics and finally call the `evaluation` method: +the metrics and choose one model checkpoint, finally call the`evaluation_once` +method: # Create model and obtain the predictions: images, labels = LoadData(...) @@ -34,20 +35,24 @@ the metrics and finally call the `evaluation` method: "mse": slim.metrics.mean_squared_error(predictions, labels), }) + checkpoint_path = '/tmp/my_model_dir/my_checkpoint' + log_dir = '/tmp/my_model_eval/' + initial_op = tf.group( tf.global_variables_initializer(), tf.local_variables_initializer()) - with tf.Session() as sess: - metric_values = slim.evaluation( - sess, - num_evals=1, - initial_op=initial_op, - eval_op=names_to_updates.values(), - final_op=name_to_values.values()) + metric_values = slim.evaluate_once( + master='', + checkpoint_path=checkpoint_path, + log_dir=log_dir, + num_evals=1, + initial_op=initial_op, + eval_op=names_to_updates.values(), + final_op=name_to_values.values()) - for metric, value in zip(names_to_values.keys(), metric_values): - logging.info('Metric %s has value: %f', metric, value) + for metric, value in zip(names_to_values.keys(), metric_values): + logging.info('Metric %s has value: %f', metric, value) ************************************************ * Evaluating a Checkpointed Model with Metrics * diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index d22b80ac88a9ced541a952fcbb58c50366464075..42898e797cc351e3de290cc65fc825f1406c739d 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -17,7 +17,7 @@ The operations in this package are safe to use with eager execution turned on or off. It has a more flexible API that allows summaries to be written directly from ops to places other than event log files, rather than propagating protos -from @{tf.summary.merge_all} to @{tf.summary.FileWriter}. +from `tf.summary.merge_all` to `tf.summary.FileWriter`. To use with eager execution enabled, write your code as follows: diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc index 7e25579070eef13682dedfcd3c9e435333f65687..6cb2c881e2428dfcac3187bf7364582e857b9879 100644 --- a/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc +++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision_node_evaluator.cc @@ -51,7 +51,8 @@ std::unique_ptr CreateBinaryDecisionNodeEvaluator( InequalityDecisionNodeEvaluator::InequalityDecisionNodeEvaluator( const decision_trees::InequalityTest& test, int32 left, int32 right) : BinaryDecisionNodeEvaluator(left, right) { - safe_strto32(test.feature_id().id().value(), &feature_num_); + CHECK(safe_strto32(test.feature_id().id().value(), &feature_num_)) + << "Invalid feature ID: [" << test.feature_id().id().value() << "]"; threshold_ = test.threshold().float_value(); include_equals_ = test.type() == decision_trees::InequalityTest::LESS_OR_EQUAL; @@ -72,7 +73,9 @@ ObliqueInequalityDecisionNodeEvaluator::ObliqueInequalityDecisionNodeEvaluator( : BinaryDecisionNodeEvaluator(left, right) { for (int i = 0; i < test.oblique().features_size(); ++i) { int32 val; - safe_strto32(test.oblique().features(i).id().value(), &val); + CHECK(safe_strto32(test.oblique().features(i).id().value(), &val)) + << "Invalid feature ID: [" << test.oblique().features(i).id().value() + << "]"; feature_num_.push_back(val); feature_weights_.push_back(test.oblique().weights(i)); } @@ -97,7 +100,8 @@ int32 ObliqueInequalityDecisionNodeEvaluator::Decide( MatchingValuesDecisionNodeEvaluator::MatchingValuesDecisionNodeEvaluator( const decision_trees::MatchingValuesTest& test, int32 left, int32 right) : BinaryDecisionNodeEvaluator(left, right) { - safe_strto32(test.feature_id().id().value(), &feature_num_); + CHECK(safe_strto32(test.feature_id().id().value(), &feature_num_)) + << "Invalid feature ID: [" << test.feature_id().id().value() << "]"; for (const auto& val : test.value()) { values_.push_back(val.float_value()); } diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index 46f3c36e3db51fde4c8732d4300a9d3eaddb452a..fc0d22d112efcccd1a3be6388d36478cf2076ff5 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -3,7 +3,7 @@ # and provide TensorRT operators and converter package. # APIs are meant to change over time. -package(default_visibility = ["//tensorflow:__subpackages__"]) +package(default_visibility = ["//visibility:public"]) licenses(["notice"]) # Apache 2.0 @@ -85,11 +85,12 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ + ":test_utils", ":trt_allocator", + ":trt_conversion", ":trt_logging", ":trt_plugins", ":trt_resources", - ":trt_conversion", ":utils", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib_proto_parsing", @@ -184,6 +185,8 @@ py_library( ], ) +# TODO(aaroey): this wrapper has been causing troubles of double linking, so +# either get rid of it, or split to make it contain minimum dependencies. tf_py_wrap_cc( name = "wrap_conversion", srcs = ["trt_conversion.i"], @@ -192,6 +195,7 @@ tf_py_wrap_cc( "//tensorflow/python:platform/base.i", ], deps = [ + ":test_utils", ":trt_conversion", ":trt_engine_op_kernel", "//third_party/python_runtime:headers", @@ -264,6 +268,7 @@ tf_cuda_library( ], deps = [ ":segment", + ":test_utils", ":trt_allocator", ":trt_plugins", ":trt_logging", @@ -274,7 +279,6 @@ tf_cuda_library( "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:utils", - "//tensorflow/core:gpu_runtime", "//tensorflow/core:framework_lite", "//tensorflow/core:graph", "//tensorflow/core:lib", @@ -412,4 +416,17 @@ cc_library( srcs = ["convert/utils.cc"], hdrs = ["convert/utils.h"], copts = tf_copts(), + deps = [ + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "test_utils", + srcs = ["test/utils.cc"], + hdrs = ["test/utils.h"], + deps = [ + "//tensorflow/core:lib", + "@com_googlesource_code_re2//:re2", + ], ) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index 3383f6bc9b99879a1c661a0d49e42c6f3b878f66..21ec8b0b30c595a1fad01b69bce9b16393742704 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -29,9 +30,7 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" #include "tensorflow/contrib/tensorrt/segment/segment.h" -#include "tensorflow/core/common_runtime/gpu/gpu_id.h" -#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" -#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#include "tensorflow/contrib/tensorrt/test/utils.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -195,20 +194,44 @@ tensorflow::Status ConvertCalibGraphToInferGraph( return tensorflow::Status::OK(); } -// Entry function from Python. tensorflow::Status ConvertGraphDefToTensorRT( const tensorflow::GraphDef& graph_def, const std::vector& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, int precision_mode, int minimum_segment_size, bool is_dyn_op, int max_cached_engines, std::vector cached_engine_batches) { - // optimization pass + // Create GrapplerItem. tensorflow::grappler::GrapplerItem item; item.fetch = output_names; item.graph = graph_def; - // grappler requires a virtual cluster with a proper GPU device - // in order to calculate flops>0 or fails with FATAL - // We add numbers from a Pascal card here to have flops>0 + + // TODO(aaroey): we should have used single machine cluster like the + // following, but the problem is then wrap_conversion will depend on + // direct_session and cause double linking problems. To fix this we need to + // fix or get rid of the swig dependency. Here we use VirtualCluster + // as a work around, and we need to create a session to initialize the + // underlying device before calling this method. +#if 0 + // Create single machine cluster. Note that this will create a session and + // initialize the gpu devices. + const int num_cpu_cores = + tensorflow::grappler::GetNumAvailableLogicalCPUCores(); + const int num_gpus = tensorflow::grappler::GetNumAvailableGPUs(); + VLOG(2) << "cpu_cores: " << num_cpu_cores; + VLOG(2) << "gpus: " << num_gpus; + const int timeout_s = 60 * 10; + std::unique_ptr cluster( + new tensorflow::grappler::SingleMachine( + timeout_s, num_cpu_cores, num_gpus)); + // These settings are the defaults in tensorflow/python/grappler/cluster.py. + cluster->DisableDetailedStats(true); + cluster->AllowSoftPlacement(true); + cluster->SetNumWarmupSteps(10); + TF_RETURN_IF_ERROR(cluster->Provision()); +#else + // Create virtual cluster. Grappler requires a virtual cluster with a proper + // GPU device in order to calculate flops>0 or fails with FATAL in dbg mode. + // We add numbers from a Pascal card here to have flops>0. tensorflow::DeviceProperties device_properties; device_properties.set_type("GPU"); device_properties.mutable_environment()->insert({"architecture", "6"}); @@ -217,47 +240,43 @@ tensorflow::Status ConvertGraphDefToTensorRT( std::unique_ptr cluster( new tensorflow::grappler::VirtualCluster( {{"/GPU:0", device_properties}})); +#endif - // single machine - int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores(); - int num_gpus = tensorflow::grappler::GetNumAvailableGPUs(); - VLOG(2) << "cpu_cores: " << num_cpu_cores; - VLOG(2) << "gpus: " << num_gpus; + // Create RewriterConfig. tensorflow::RewriterConfig rw_cfg; - // use only const folding and layout for the time being since new optimizers - // break the graph for us + // TODO(aaroey): use only const folding and layout for the time being since + // new optimizers break the graph for trt. rw_cfg.add_optimizers("constfold"); rw_cfg.add_optimizers("layout"); - rw_cfg.set_meta_optimizer_iterations(tensorflow::RewriterConfig::ONE); + auto optimizer = rw_cfg.add_custom_optimizers(); + optimizer->set_name("TensorRTOptimizer"); + auto& parameters = *(optimizer->mutable_parameter_map()); + parameters["minimum_segment_size"].set_i(minimum_segment_size); + parameters["max_batch_size"].set_i(max_batch_size); + parameters["is_dynamic_op"].set_b(is_dyn_op); + parameters["max_workspace_size_bytes"].set_i(max_workspace_size_bytes); + TF_RETURN_IF_ERROR(GetPrecisionModeName( + precision_mode, parameters["precision_mode"].mutable_s())); + parameters["maximum_cached_engines"].set_i(max_cached_engines); + if (!cached_engine_batches.empty()) { + auto list = parameters["cached_engine_batches"].mutable_list(); + for (const int batch : cached_engine_batches) { + list->add_i(batch); + } + } + + // Run optimizer. tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg); - tensorflow::GraphDef gdef; - TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, &gdef)); - item.graph = gdef; - - // AJ refactoring shape inference through grappler/GraphProperties. - tensorflow::grappler::GraphProperties static_graph_properties(item); - TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - // Build full graph - ConversionParams cp; - cp.input_graph_def = &gdef; - cp.output_names = &output_names; - cp.max_batch_size = max_batch_size; - cp.output_graph_def = new_graph_def; - cp.precision_mode = precision_mode; - cp.is_dyn_op = is_dyn_op; - cp.max_cached_engines = max_cached_engines; - cp.cached_engine_batches = cached_engine_batches; - cp.minimum_segment_size = minimum_segment_size; - cp.graph_properties = &static_graph_properties; - cp.max_workspace_size_bytes = max_workspace_size_bytes; + TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def)); + if (VLOG_IS_ON(5)) { std::fstream f; f.open("TRTConversionInput.pb", std::fstream::out | std::fstream::binary | std::fstream::trunc); - f << gdef.SerializeAsString(); + f << new_graph_def->SerializeAsString(); f.close(); } - return ConvertAfterShapes(cp); + return Status::OK(); } // Function to get subsegment information structure. @@ -268,11 +287,10 @@ tensorflow::Status GetEngineInfo( const std::unordered_map& node_map, const std::vector& reverse_topo_order, EngineInfo* info) { - std::vector subgraph_node_ids; + std::vector subgraph_node_ids; // Topologically sorted node ids. + std::set subgraph_node_names = segment_nodes; std::set added_const_node_ids; // Used to prevent double insertion. std::set segment_devices; - int input_port = 0; - int output_port = 0; // Map from src_node_name+port to the unique port numbers of the TRT op, where // the src_node_name is the name of the source node of the input/output @@ -280,13 +298,12 @@ tensorflow::Status GetEngineInfo( // input/output edges must be in different split of the graph. // TODO(aaroey): consider using node id and port instead. // TODO(aaroey): using topo order instead of reverting reverse topo order. - std::unordered_map created_edges; + std::unordered_map input_to_engine_port, output_to_engine_port; for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend(); ++it) { const auto& node_name = (*it)->name(); - if (segment_nodes.count(node_name) == 0) continue; - auto node = node_map.at(node_name); + auto node = *it; auto node_device = node->requested_device(); if (!node_device.empty()) { segment_devices.insert(node_device); @@ -299,64 +316,93 @@ tensorflow::Status GetEngineInfo( } } const int node_id = node->id(); + subgraph_node_ids.push_back(node_id); + // Create input connections. for (const auto edge : node->in_edges()) { auto input_node = edge->src(); - if (segment_nodes.count(input_node->name()) == 0 && - !edge->IsControlEdge() && !input_node->IsSource()) { - // Add constant input node into the segment. We don't care if it has - // other output edges going into other engines or TF nodes. Since we add - // it only to the subsegment node list, not the subsegment itself, it - // won't be removed from the graph. If it doesn't have any edges, TF - // will prune it out. - if (input_node->type_string() == "Const") { - if (added_const_node_ids.count(input_node->id()) == 0) { - added_const_node_ids.insert(input_node->id()); - subgraph_node_ids.push_back(input_node->id()); - } + if (input_node->IsSource() || segment_nodes.count(input_node->name())) { + continue; + } + if (edge->IsControlEdge()) { + // Control input. + info->connections.emplace_back(input_node->name(), input_node->id(), + node_name, node_id, + /*input_edge=*/true); + } else if (input_node->type_string() == "Const") { + // Add constant data input nodes into the segment graphdef (thus also in + // the engine). We don't care if it has other output edges going into + // other engines or TF nodes. Since we add it only to the segment + // graphdef, not the segment itself, it won't be removed from the graph. + // If it doesn't have any edges, TF will prune it out. + // + // Note that the segmenter already ensure that the constant data input + // is valid and suppported by the engine. + if (!added_const_node_ids.insert(input_node->id()).second) { + // Already added before. + continue; + } + VLOG(1) << "Adding const node " << input_node->name(); + QCHECK(subgraph_node_names.insert(input_node->name()).second); + // Since we already add (duplicate) the const input node to the segment + // graphdef, it's now not a data dependency any more, but to make the + // dependency correct we still add a control dependency. + info->connections.emplace_back(input_node->name(), input_node->id(), + node_name, node_id, + /*input_edge=*/true); + } else { + // Non-const data input. + int port = Graph::kControlSlot - 1; + // Use the source non-segment node name/port as key. + const string s = StrCat(input_node->name(), ":", edge->src_output()); + VLOG(1) << "Input edge = " << s; + if (input_to_engine_port.count(s)) { + port = input_to_engine_port.at(s); } else { - string s(input_node->name()); - StrAppend(&s, ":", edge->src_output()); - VLOG(1) << "Input edge = " << s; - int port = input_port; - if (created_edges.count(s)) { - port = created_edges.at(s); - } else { - created_edges.insert({s, port}); - input_port++; - } - info->connections.emplace_back(input_node->name(), input_node->id(), - edge->src_output(), node_name, node_id, - edge->dst_input(), true, port); + port = input_to_engine_port.size(); + input_to_engine_port.insert({s, port}); } + info->connections.emplace_back( + input_node->name(), input_node->id(), edge->src_output(), node_name, + node_id, edge->dst_input(), /*input_edge=*/true, port); } } - // We need to add possible const input nodes before adding this node in - // order to keep the topological order. - subgraph_node_ids.push_back(node_id); + // Create output connections. for (const auto edge : node->out_edges()) { auto output_node = edge->dst(); - if (segment_nodes.count(output_node->name()) == 0 && - !edge->IsControlEdge() && !output_node->IsSink()) { - string s(node_name); - StrAppend(&s, ":", edge->src_output()); + if (output_node->IsSink() || segment_nodes.count(output_node->name())) { + continue; + } + if (edge->IsControlEdge()) { + // Control output. + info->connections.emplace_back(output_node->name(), output_node->id(), + node_name, node_id, + /*input_edge=*/false); + } else { + // Data output. + int port = Graph::kControlSlot - 1; + // Use the source segment node name/port as key. + const string s = StrCat(node_name, ":", edge->src_output()); VLOG(1) << "Output edge = " << s; - int port = output_port; - if (created_edges.count(s)) { - port = created_edges.at(s); + if (output_to_engine_port.count(s)) { + port = output_to_engine_port.at(s); } else { - created_edges.insert({s, port}); - output_port++; + port = output_to_engine_port.size(); + output_to_engine_port.insert({s, port}); } - info->connections.emplace_back(output_node->name(), output_node->id(), - edge->dst_input(), node_name, node_id, - edge->src_output(), false, port); + info->connections.emplace_back( + output_node->name(), output_node->id(), edge->dst_input(), + node_name, node_id, edge->src_output(), /*input_edge=*/false, port); } } - } + } // For each segment node in topological order. + // Construct the const nodes first. + subgraph_node_ids.insert(subgraph_node_ids.begin(), + added_const_node_ids.begin(), + added_const_node_ids.end()); TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef( - g, graph_properties, subgraph_node_ids, &info->connections, - &info->segment_graph_def, &info->engine_name)); + g, graph_properties, subgraph_node_names, subgraph_node_ids, + &info->connections, &info->segment_graph_def, &info->engine_name)); // TODO(sami): This should not happen once segmenter is updated. if (segment_devices.size() == 1) { info->device = *segment_devices.begin(); @@ -366,94 +412,137 @@ tensorflow::Status GetEngineInfo( << "but this shouldn't have happened"; info->device = *segment_devices.begin(); } else { - VLOG(1) << "Segment devices size is 0"; + LOG(ERROR) << "Can't find a device placement for the op!"; } return Status::OK(); } -// Function to insert a TRT node into the graph. The graph is not modified if -// the returned status is not ok. -// 'alloc' is only used for creating static engine. -tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, - const std::vector& infos, int pos, +// Helper function to update edge connection from the removed node to the +// engine node. If an outside node is gone, it must have been absorbed into +// an engine node. Find the engine node. +void UpdateToEngineNode(const std::vector& infos, + const size_t my_engine_id, + const std::vector& engine_nodes, + const bool is_input_edge, const string& node_name, + tensorflow::Node** node, int* port) { + for (size_t t = 0; t < infos.size(); ++t) { + if (t == my_engine_id) { + continue; + } + const auto& info = infos.at(t); + for (const auto& eng_conn : info.connections) { + // If the connection being updated is an input connection, the source of + // the connection must be an output connection of another engine. And vise + // versa. + if (is_input_edge == eng_conn.is_input_edge) continue; + if (eng_conn.inside_node_name == node_name && + eng_conn.inside_port == *port) { + *node = CHECK_NOTNULL(engine_nodes[t]); + QCHECK_EQ(info.engine_name, (**node).name()) + << "Engine name mismatch: " << info.engine_name << " vs " + << (**node).name(); + *port = eng_conn.port_number; + return; + } + } + } + LOG(FATAL) << "Node " << (**node).name() << " not found in any engine."; +} + +// Function to insert a TRT engine node into the graph. +// Create engine nodes in the following way: +// 1. Each invocation of CreateTRTNode creates an engine node for infos[pos] +// 2. When an engine node is created, add it into the graph with necessary +// re-wiring. +// 2.1. If the outside connected node is existing, connect the engine +// node to it. +// 2.2. If the outside connected node is gone, it must have been absorted +// into another engine node (which was processed before the processing +// one). Connect to the pre-existing engine node instead. +// 3. In this way, we ensure the graph is topologically sort-able after each +// invocation of CreateTRTNode(). +tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, + int max_batch_size, tensorflow::Graph* graph, nvinfer1::IGpuAllocator* alloc, - int max_batch_size) { + std::vector* engine_nodes) { const auto& info = infos.at(pos); + TRT_RETURN_IF_TEST_VALUE(StrCat(info.engine_name, ":CreateTRTNode"), "fail"); std::vector output_shape_protos; std::vector input_shape_protos; std::vector input_shapes; std::vector inputs; + std::vector input_nodes; + std::vector control_input_nodes; + std::unordered_set control_input_names; std::vector out_types; - VLOG(1) << "Processing " << info.engine_name; - // Update the shape and data types of input/output nodes, and find all unique - // inputs. + VLOG(1) << "Processing " << info.engine_name; + // Collect needed info for creating the engine node in the graph for (const auto& conn : info.connections) { - if (!conn.is_input_edge) { - // Set the shapes and data types of output edge. - tensorflow::TensorShapeProto out_shape; - // shape of the output node inside segment - conn.inside_shape.AsProto(&out_shape); - if (output_shape_protos.size() <= conn.port_number) { - output_shape_protos.resize(conn.port_number + 1); - out_types.resize(conn.port_number + 1); + // Control edges + if (conn.is_control_edge()) { + // Skip control outputs for now. control output info are not needed for + // node creation and will be processed later. + if (!conn.is_input_edge) continue; + + // Rewrire control input if it's not found in original graph. + tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id); + int port = tensorflow::Graph::kControlSlot; + if (!input_node) { + UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true, + conn.outside_node_name, &input_node, &port); + QCHECK_EQ(Graph::kControlSlot, port); } - output_shape_protos.at(conn.port_number) = out_shape; - out_types.at(conn.port_number) = conn.connection_type; - continue; - } - - // Set the shapes and data types of input edge. - tensorflow::TensorShapeProto in_shape; - conn.outside_shape.AsProto(&in_shape); - if (input_shape_protos.size() <= conn.port_number) { - input_shape_protos.resize(conn.port_number + 1); - input_shapes.resize(conn.port_number + 1); - } - input_shape_protos.at(conn.port_number) = in_shape; - input_shapes.at(conn.port_number) = conn.outside_shape; - - string input_node = conn.outside_node_name; - int input_port = conn.outside_port; - bool found_engine = false; - // Rewire the inputs to other engines if they contain original input node. - // Note that we use the information of the engine here, not the information - // of the created TRT nodes, so we're able to find all the connections to - // any other engines beforehand. - for (size_t t = 0; t < infos.size(); ++t) { - if (t == pos) continue; - auto& engine_info = infos.at(t); - for (const auto& eng_conn : engine_info.connections) { - if (eng_conn.is_input_edge) continue; - if (eng_conn.inside_node_name == input_node) { - input_node = engine_info.engine_name; - if (eng_conn.inside_port == input_port) { - input_port = eng_conn.port_number; - found_engine = true; - break; - } - } + if (!control_input_names.insert(input_node->name()).second) { + continue; } - if (found_engine) break; - } - VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> " - << info.engine_name << ":" << inputs.size(); - // Skip duplicate inputs. - // TODO(aaroey): use std::find instead. GetEngineInfo already remove - // duplicate connections, so here we should never find any duplicate? - bool new_input = true; - for (const auto& inp : inputs) { - if (inp.node == input_node && inp.index == input_port) { - new_input = false; - break; + control_input_nodes.push_back(input_node); + VLOG(1) << "Engine Control Input " << input_node->name() << " -> " + << info.engine_name; + } else { + // Data edges + if (!conn.is_input_edge) { + // Set the shapes and data types of output edge. + tensorflow::TensorShapeProto out_shape; + // shape of the output node inside segment + conn.inside_shape.AsProto(&out_shape); + if (output_shape_protos.size() <= conn.port_number) { + output_shape_protos.resize(conn.port_number + 1); + out_types.resize(conn.port_number + 1); + } + output_shape_protos.at(conn.port_number) = out_shape; + out_types.at(conn.port_number) = conn.connection_type; + } else { + // Set the shapes and data types of input edge. + tensorflow::TensorShapeProto in_shape; + conn.outside_shape.AsProto(&in_shape); + if (input_shape_protos.size() <= conn.port_number) { + input_shape_protos.resize(conn.port_number + 1); + input_shapes.resize(conn.port_number + 1); + } + input_shape_protos.at(conn.port_number) = in_shape; + input_shapes.at(conn.port_number) = conn.outside_shape; + + // Rewrire data input if it's not found in original graph. + tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id); + int port = conn.outside_port; + if (!input_node) { + UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/true, + conn.outside_node_name, &input_node, &port); + } + if (std::find_if( + std::begin(inputs), std::end(inputs), + [input_node, &port](const NodeDefBuilder::NodeOut& inp) { + return inp.node == input_node->name() && inp.index == port; + }) == std::end(inputs)) { + inputs.emplace_back(input_node->name(), port, conn.connection_type); + input_nodes.push_back(CHECK_NOTNULL(input_node)); + VLOG(1) << "Engine Input " << input_node->name() << ":" << port + << " -> " << info.engine_name << ":" << inputs.size() - 1; + } } } - if (new_input) { - inputs.emplace_back(input_node, input_port, conn.connection_type); - } } - - // Build the engine and get its serialized representation. string segment_string; if (info.engine_type == EngineInfo::EngineType::TRTStatic || info.precision_mode == INT8MODE) { @@ -485,21 +574,10 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, // TODO(aaroey): use enum instead, and add a helper method to do the // conversion. string prec_string; - switch (info.precision_mode) { - case FP32MODE: - prec_string = "FP32"; - break; - case FP16MODE: - prec_string = "FP16"; - break; - case INT8MODE: - prec_string = "INT8"; - if (!TRTResourceManager::instance()->getManager("TRTCalibration")) { - LOG(ERROR) << "Failed to construct calibration storage"; - } - break; - default: - return tensorflow::errors::OutOfRange("Unknown precision mode"); + TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string)); + if (info.precision_mode == INT8MODE && + !TRTResourceManager::instance()->getManager("TRTCalibration")) { + LOG(ERROR) << "Failed to construct calibration storage"; } tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp"); if (!info.device.empty()) node_builder.Device(info.device); @@ -511,6 +589,10 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, VLOG(1) << ins; } node_builder.Input(inputs); + for (const string& c : control_input_names) { + node_builder.ControlInput(c); + } + if (info.engine_type == EngineInfo::EngineType::TRTStatic && info.cached_engine_batches.size()) { LOG(WARNING) << "Cached engine batches are ignored for static engines"; @@ -539,34 +621,55 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph, // Up until this point, graph is not modified. If we return !status.ok() from // here, this segment will be skipped + // TODO(aaroey): let it return proper error status for the following logic + // instead of checking fail. tensorflow::Node* engine_node = graph->AddNode(trt_node, &status); + (*engine_nodes)[pos] = engine_node; if (!status.ok()) { LOG(ERROR) << "Adding node failed " << status; return status; } + // Add control input and input edges to the engine node. + for (const auto in : control_input_nodes) { + VLOG(1) << "Connecting control edge from " << in->name() << " to " + << engine_node->name(); + graph->AddControlEdge(in, engine_node); + } + VLOG(1) << "input_nodes size = " << input_nodes.size(); + for (int i = 0; i < input_nodes.size(); ++i) { + Node* n = CHECK_NOTNULL(input_nodes[i]); + const auto& in = inputs[i]; + VLOG(1) << "Connecting data edge from " << n->name() << ":" << in.index + << " to " << engine_node->name() << ":" << i; + graph->AddEdge(n, in.index, engine_node, i); + } + // Updates the inputs of output edges destination nodes, and point them to the // engine node. for (auto& conn : info.connections) { - if (conn.is_input_edge) continue; - VLOG(1) << " Updating DBG " << engine_node->name() << " out_port " - << conn.port_number << " out_id " << conn.outside_id - << " name=" << conn.outside_node_name; - auto dst_node = graph->FindNodeId(conn.outside_id); - // dst_node can only be removed if it is an input node of another engine. - // In this case, other engines input edge is updated in nodedef to point to - // this engine. Even though edge doesn't exists in the graph, when it is - // deserialized again, correct edges will be constructed. This is a problem - // of graph->AddNode(). - if (!dst_node) continue; + if (conn.is_input_edge) { + continue; + } + tensorflow::Node* output_node = graph->FindNodeId(conn.outside_id); + int port = conn.outside_port; + if (!output_node) { + UpdateToEngineNode(infos, pos, *engine_nodes, /*is_input_edge=*/false, + conn.outside_node_name, &output_node, &port); + } VLOG(1) << "Updating " << engine_node->name() << ":" << conn.port_number - << " to " << dst_node->name() << ":" << conn.outside_port; - auto new_edge = graph->AddEdge(engine_node, conn.port_number, dst_node, - conn.outside_port); - CHECK(new_edge) << "Adding a new edge failed " << engine_node->name() << ":" - << conn.port_number << " -> " << dst_node->name() << ":" - << conn.outside_port; + << " to " << output_node->name() << ":" << port; + if (conn.is_control_edge()) { + QCHECK_EQ(Graph::kControlSlot, port); + graph->AddControlEdge(engine_node, output_node); + } else { + auto new_edge = + graph->AddEdge(engine_node, conn.port_number, output_node, port); + QCHECK(new_edge) << "Adding a new edge failed " << engine_node->name() + << ":" << conn.port_number << " -> " + << output_node->name() << ":" << conn.outside_port; + } } - return status; + return Status::OK(); } // Function to construct a funcdef from the segment and add it to the graph. @@ -666,72 +769,36 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary( } std::pair GetDeviceAndAllocator( - ConversionParams& params, EngineInfo& engine) { + const ConversionParams& params, const EngineInfo& engine) { int cuda_device_id = -1; - auto check_device_id = [](int tfid) -> int { - tensorflow::TfGpuId tf_gpu_id(tfid); - CudaGpuId cuda_gpu_id; - Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id); - if (s.ok()) { - VLOG(1) << "Found TF GPU " << tf_gpu_id.value() << " at cuda device " - << cuda_gpu_id.value(); - return cuda_gpu_id.value(); - } - VLOG(2) << "TF GPU with id " << tfid << " do not exist " << s; - return -1; - }; tensorflow::Allocator* dev_allocator = nullptr; - // we need to us PM here since in python path there is no way to get - // to allocators. - // TODO(sami): when grappler devices become available else path will not be - // necessary - auto pm = tensorflow::GPUProcessState::singleton(); - if (params.cluster) { // get allocator - tensorflow::Device* device = nullptr; - if (params.cluster->GetDeviceSet()) { - device = params.cluster->GetDeviceSet()->FindDeviceByName(engine.device); + if (params.cluster) { + std::vector devices; + if (!engine.device.empty() && params.cluster->GetDeviceSet()) { + DeviceNameUtils::ParsedName parsed_name; + if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name) && + parsed_name.has_id) { + params.cluster->GetDeviceSet()->FindMatchingDevices(parsed_name, + &devices); + } } - if (device) { + if (!devices.empty()) { + if (devices.size() > 1) { + string msg = "Found multiple matching devices using name '"; + StrAppend(&msg, engine.device, "': "); + for (auto d : devices) StrAppend(&msg, d->name(), ", "); + StrAppend(&msg, ". Will get the allocator from first one."); + LOG(WARNING) << msg; + } tensorflow::AllocatorAttributes alloc_attr; - dev_allocator = device->GetAllocator(alloc_attr); - VLOG(1) << "Using allocator " << dev_allocator->Name(); + cuda_device_id = devices[0]->tensorflow_gpu_device_info()->gpu_id; + dev_allocator = devices[0]->GetAllocator(alloc_attr); + VLOG(1) << "Using allocator " << dev_allocator->Name() + << " and cuda_device_id " << cuda_device_id; } else { LOG(WARNING) << "Cluster is set but device '" << engine.device << "' is not found in the cluster"; } - } else { // cluster not found, possibly a python call - VLOG(1) << "Cluster is not set, probably called from python"; - int found_device = 0; - bool try_gpu_ids = true; - // if device is set, try to find the device. Might be a problem for multi - // host case but TensorRT do not support multi host setups yet. - if (!engine.device.empty()) { - DeviceNameUtils::ParsedName parsed_name; - if (DeviceNameUtils::ParseFullName(engine.device, &parsed_name)) { - cuda_device_id = parsed_name.has_id ? parsed_name.id : -1; - } - try_gpu_ids = !parsed_name.has_id; - } - if (try_gpu_ids) { - while (found_device < 100) { - cuda_device_id = check_device_id(found_device); - if (cuda_device_id >= 0) break; - found_device++; - } - } - if (found_device == 100) { - LOG(ERROR) << " Can't find a GPU device to work with. Please " - "instantiate a session to initialize devices"; - return std::make_pair(cuda_device_id, dev_allocator); - } - LOG(WARNING) - << "Can't determine the device, constructing an allocator at device " - << found_device; - tensorflow::GPUOptions gpuoptions; - // this will be a noop if device is already initialized - gpuoptions.set_allow_growth(true); - tensorflow::TfGpuId tf_gpu_id(found_device); - dev_allocator = pm->GetGPUAllocator(gpuoptions, tf_gpu_id, 1); } return std::make_pair(cuda_device_id, dev_allocator); } @@ -824,6 +891,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { LOG(ERROR) << "Couldn't get current device: " << cudaGetErrorString(err); } VLOG(1) << "Current cuda device is " << old_cuda_device; + std::vector engine_nodes; + engine_nodes.resize(engine_segments.size()); for (int i = 0; i < engine_segments.size(); ++i) { auto& engine = engine_segments.at(i); // Partition the workspace size by the average of node ratio and segment @@ -847,19 +916,21 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { LOG(WARNING) << "Can't identify the cuda device. Running on device 0 "; } cudaSetDevice(cuda_device_id); - auto status = CreateTRTNode(&graph, engine_segments, i, alloc.get(), - params.max_batch_size); + auto status = CreateTRTNode(engine_segments, i, params.max_batch_size, + &graph, alloc.get(), &engine_nodes); // If status is ok, we successfully added the node to the graph and can // remove segment ops. Otherwise graph is not modified. + const string msg = StrCat("Engine ", engine.engine_name, + " creation for segment ", i, ", composed of ", + converted_segments.at(i).first.size(), " nodes"); if (status.ok()) { + LOG(INFO) << msg << " succeeded."; for (auto node_name : converted_segments.at(i).first) { graph.RemoveNode(node_map.at(node_name)); } } else { // Graph is not modified. - LOG(WARNING) << "Engine creation for segment " << i << ", composed of " - << converted_segments.at(i).first.size() - << " nodes failed: " << status << ". Skipping..."; + LOG(WARNING) << msg << " failed: " << status << ". Skipping..."; } } cudaSetDevice(old_cuda_device); diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 451d6fe698bbcf89570fdf54fb3d780a731e7d74..35fa590254137d62fea868882d5c225848829ca1 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include #include +#include #include #include @@ -2690,7 +2691,7 @@ tensorflow::Status ConvertGraphDefToEngine( // Graph nodes are already topologically sorted during construction for (const auto& node_def : gdef.node()) { string node_name = node_def.name(); - VLOG(1) << "Converting op name=" << node_name << ", op=" << node_def.op(); + VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op(); if (tensorflow::str_util::StartsWith(node_name, kInputPHName) && (node_def.op() == "Placeholder")) { nvinfer1::DimsCHW input_dim_pseudo_chw; @@ -2788,6 +2789,7 @@ tensorflow::Status ConvertGraphDefToEngine( tensorflow::Status ConvertSegmentToGraphDef( const tensorflow::Graph* graph, const tensorflow::grappler::GraphProperties& graph_properties, + const std::set& subgraph_node_names, const std::vector& subgraph_node_ids, // In topological order std::vector* connections, tensorflow::GraphDef* segment_def, string* common_scope) { @@ -2796,6 +2798,7 @@ tensorflow::Status ConvertSegmentToGraphDef( // nodes in the segment graphdef. for (size_t i = 0; i < connections->size(); ++i) { auto& connection = connections->at(i); + if (connection.is_control_edge()) continue; auto outside_node = graph->FindNodeId(connection.outside_id); if (!outside_node) { // This should never happen, unless the original graph is problematic. @@ -2809,13 +2812,13 @@ tensorflow::Status ConvertSegmentToGraphDef( GetInputProperties(graph_properties, graph->FindNodeId(connection.outside_id), connection.outside_port, &partial_shape, &dtype); - + connection.outside_shape = partial_shape; } else { GetOutputProperties(graph_properties, graph->FindNodeId(connection.outside_id), connection.outside_port, &partial_shape, &dtype); + connection.inside_shape = partial_shape; } - connection.outside_shape = partial_shape; connection.connection_type = dtype; // Add dummy input/output nodes to the segment graphdef. @@ -2868,12 +2871,12 @@ tensorflow::Status ConvertSegmentToGraphDef( old_to_new_id_map[node_id] = segment_def->node_size(); auto snode = segment_def->add_node(); snode->CopyFrom(node->def()); - VLOG(1) << "Copying " << snode->name() << " to subgraph"; + VLOG(2) << "Copying " << snode->name() << " to subgraph"; } // Update the inputs of the new input nodes to point to placeholder nodes. for (int i = 0; i < connections->size(); ++i) { auto& connection = connections->at(i); - if (!connection.is_input_edge) continue; + if (connection.is_control_edge() || !connection.is_input_edge) continue; auto snode = segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); const string placeholder_name = @@ -2883,6 +2886,39 @@ tensorflow::Status ConvertSegmentToGraphDef( << placeholder_name; snode->set_input(connection.inside_port, placeholder_name); } + // Remove control inputs that are not inside the segment. + for (int i = 0; i < segment_def->node_size(); ++i) { + auto snode = segment_def->mutable_node(i); + const int input_size = snode->input_size(); + int input_idx = 0; + int actual_input_idx = 0; + while (input_idx < input_size) { + TensorId input = ParseTensorName(snode->input(input_idx)); + if (!subgraph_node_names.count( + string(input.first.data(), input.first.size())) && + !str_util::StartsWith(input.first, kInputPHName)) { + if (input.second == Graph::kControlSlot) { + VLOG(1) << "... removing control inputs " << input.first + << " from subgraph."; + ++input_idx; + continue; + } else { + return tensorflow::errors::InvalidArgument( + "Found non control input outside the segment that is not an " + "engine connection to ", + snode->name(), ": ", input.first); + } + } + if (actual_input_idx != input_idx) { + snode->set_input(actual_input_idx, snode->input(input_idx)); + } + ++input_idx; + ++actual_input_idx; + } + for (int remove = input_size - actual_input_idx; remove > 0; --remove) { + snode->mutable_input()->RemoveLast(); + } + } *common_scope = local_scope; VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph"; return tensorflow::Status::OK(); @@ -2897,12 +2933,12 @@ bool InputEdgeValidator::operator()(const tensorflow::Edge* in_edge) const { nvinfer1::DataType trt_dtype; Status status = ValidateInputProperties(shape, dtype, &trt_dtype); if (!status.ok()) { - VLOG(2) << "--> Need to remove input node " << in_edge->dst()->name() + VLOG(1) << "--> Need to remove input node " << in_edge->dst()->name() << ": " << status; return false; } if (shape.dims() < 3 && in_edge->src()->type_string() != "Const") { - VLOG(2) << "--> Need to remove input node " << in_edge->dst()->name() + VLOG(1) << "--> Need to remove input node " << in_edge->dst()->name() << " which has an input at port " << in_edge->dst_input() << " with #dim<3 and is not a const: " << shape; return false; @@ -2913,7 +2949,7 @@ bool InputEdgeValidator::operator()(const tensorflow::Edge* in_edge) const { bool OutputEdgeValidator::operator()(const tensorflow::Edge* out_edge) const { if (out_edge->IsControlEdge()) return true; if (out_edge->src()->type_string() == "Const") { - VLOG(2) << "--> Need to remove output node " << out_edge->src()->name() + VLOG(1) << "--> Need to remove output node " << out_edge->src()->name() << " which is a Const."; return false; } diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 6ae60ec352587feb8b26d6fcc69c907a5f145760..a60253740fe0b27dcd9c20618d6d05aa7001a1a1 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -36,16 +36,12 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -static const char* kInputPHName = "InputPH_"; -static const char* kOutputPHName = "OutputPH_"; +static const char* kInputPHName = "TensorRTInputPH_"; +static const char* kOutputPHName = "TensorRTOutputPH_"; namespace convert { -// TODO(aaroey): use an enum instead. -const int FP32MODE = 0; -const int FP16MODE = 1; -const int INT8MODE = 2; - struct EngineConnection { + // Constructs a non-control edge. EngineConnection(const string& outside, int out_id, int out_port, const string& inside, int in_id, int in_port, bool input_edge, int port) @@ -58,21 +54,35 @@ struct EngineConnection { is_input_edge(input_edge), port_number(port) {} + // Constructs a control edge. + EngineConnection(const string& outside, int out_id, const string& inside, + int in_id, bool input_edge) + : outside_node_name(outside), + outside_id(out_id), + outside_port(Graph::kControlSlot), + inside_node_name(inside), + inside_id(in_id), + inside_port(Graph::kControlSlot), + is_input_edge(input_edge), + port_number(Graph::kControlSlot) {} + + bool is_control_edge() const { return port_number == Graph::kControlSlot; } + const string outside_node_name; const int outside_id; const int outside_port; - tensorflow::PartialTensorShape outside_shape; + tensorflow::PartialTensorShape outside_shape; // Only set for input edge. const string inside_node_name; const int inside_id; const int inside_port; - tensorflow::PartialTensorShape inside_shape; + tensorflow::PartialTensorShape inside_shape; // Only set for output edge. tensorflow::DataType connection_type; - bool is_input_edge; + const bool is_input_edge; - // The port number of the TRT node connecting to this edge. - int port_number; + // The port number of the TRT node connected with this edge. + const int port_number; }; struct EngineInfo { @@ -85,7 +95,9 @@ struct EngineInfo { string device; tensorflow::GraphDef segment_graph_def; - // The segment nodes that are on one side of the edges are topological sorted. + // Non-control input connections inside this vector are sorted in a way such + // that, the segment nodes connecting to them are topological sorted. + // In addition, for non-control connections, there must be no duplicates. std::vector connections; enum class EngineType { TRTStatic = 0, TRTDynamic = 1 }; @@ -101,6 +113,7 @@ struct EngineInfo { // (OutputPH_*). This function needs to be called before TensorRT nodes // inserted in order to correctly get sizes from the original graph. // +// - subgraph_node_names: the node names of the subgraph. // - subgraph_node_ids: the node ids of the subgraph, must be sorted in // topological order. // - segment_def: the output GraphDef, whose non-input/output nodedefs will be @@ -110,6 +123,7 @@ struct EngineInfo { tensorflow::Status ConvertSegmentToGraphDef( const tensorflow::Graph* graph, const tensorflow::grappler::GraphProperties& graph_properties, + const std::set& subgraph_node_names, const std::vector& subgraph_node_ids, std::vector* connections, tensorflow::GraphDef* segment_def, string* common_scope); diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc index 044c736c03e0dcad0d27d6b9ad9d244816596536..f33f2cc4d68f5ac10eafeb744f8162bfca0abfab 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stacktrace.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -189,9 +190,6 @@ tensorflow::Status TRTOptimizationPass::Optimize( tensorflow::grappler::Cluster* cluster, const tensorflow::grappler::GrapplerItem& item, GraphDef* optimized_graph) { VLOG(1) << "Called TRTOptimization Pass " << name_; - if (VLOG_IS_ON(1)) { - PrintDebugInfo(cluster, item); - } // This is a hack to workaround optimizer issue. MetaOptimizer calls // optimization passes on function objects as well, we should not modify // generated funcdefs! This is fragile but we don't have any other option @@ -203,6 +201,10 @@ tensorflow::Status TRTOptimizationPass::Optimize( *optimized_graph = item.graph; return tensorflow::Status::OK(); } + if (VLOG_IS_ON(1)) { + VLOG(2) << CurrentStackTrace(); + PrintDebugInfo(cluster, item); + } int max_dim = -1; if (item.feed.size()) { for (const auto& f : item.feed) { diff --git a/tensorflow/contrib/tensorrt/convert/utils.cc b/tensorflow/contrib/tensorrt/convert/utils.cc index 17857cf4d002b663f38248cc0ff989915ec864b4..e7a1febb8c076891596741fe30721e7acca15a73 100644 --- a/tensorflow/contrib/tensorrt/convert/utils.cc +++ b/tensorflow/contrib/tensorrt/convert/utils.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + namespace tensorflow { namespace tensorrt { @@ -31,5 +34,36 @@ bool IsGoogleTensorRTEnabled() { #endif } +Status GetPrecisionModeName(const int precision_mode, string* name) { + switch (precision_mode) { + case FP32MODE: + *name = "FP32"; + break; + case FP16MODE: + *name = "FP16"; + break; + case INT8MODE: + *name = "INT8"; + break; + default: + return tensorflow::errors::OutOfRange("Unknown precision mode"); + } + return Status::OK(); +} + +Status GetPrecisionMode(const string& name, int* precision_mode) { + if (name == "FP32") { + *precision_mode = FP32MODE; + } else if (name == "FP16") { + *precision_mode = FP16MODE; + } else if (name == "INT8") { + *precision_mode = INT8MODE; + } else { + return tensorflow::errors::InvalidArgument("Invalid precision mode name: ", + name); + } + return Status::OK(); +} + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/contrib/tensorrt/convert/utils.h index 8b5f4d614a9c1f849f0aec9df42100bb4126b439..0592f31462af2b20f3a13fe5119e89c2ba42dd8a 100644 --- a/tensorflow/contrib/tensorrt/convert/utils.h +++ b/tensorflow/contrib/tensorrt/convert/utils.h @@ -18,6 +18,8 @@ limitations under the License. #include +#include "tensorflow/core/lib/core/status.h" + namespace tensorflow { namespace tensorrt { @@ -33,6 +35,15 @@ using TrtUniquePtrType = std::unique_ptr>; bool IsGoogleTensorRTEnabled(); +// TODO(aaroey): use an enum instead. +const int FP32MODE = 0; +const int FP16MODE = 1; +const int INT8MODE = 2; + +Status GetPrecisionModeName(const int precision_mode, string* name); + +Status GetPrecisionMode(const string& name, int* precision_mode); + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 6699b71d285f1f4fa8cc9bb66679c65e71d16dcc..2b42d81f475189f74a934c3aeed7d7fc34d4eb53 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h" #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h" #include "tensorflow/contrib/tensorrt/resources/trt_resources.h" +#include "tensorflow/contrib/tensorrt/test/utils.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -122,15 +123,9 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("calibration_data", &calibration_data)); OP_REQUIRES_OK(context, context->GetAttr("segment_funcdef_name", &funcdef_name_)); - if (precision_string == "FP32") { - precision_mode_ = convert::FP32MODE; - } else if (precision_string == "FP16") { - precision_mode_ = convert::FP16MODE; - } else if (precision_string == "INT8") { - precision_mode_ = convert::INT8MODE; - } + OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_)); calibration_mode_ = - (precision_mode_ == convert::INT8MODE && calibration_data.size() == 0); + (precision_mode_ == INT8MODE && calibration_data.size() == 0); if (calibration_data.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); @@ -179,7 +174,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, helper->Ref(); // Increment count for calculating native graph VLOG(1) << "Executing native segment " << name(); lib->Run(opts, native_func_, inputs, outputs, - [ctx, outputs, helper](const tensorflow::Status& s) { + [this, ctx, outputs, helper](const tensorflow::Status& s) { tensorflow::core::ScopedUnref sc(helper); VLOG(1) << "Native Segment completed"; if (!s.ok()) { @@ -189,6 +184,8 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, for (size_t t = 0; t < outputs->size(); ++t) { ctx->set_output(t, outputs->at(t)); } + test::AddTestValue(StrCat(this->name(), ":ExecuteNativeSegment"), + "done"); delete outputs; }); } @@ -234,6 +231,7 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, ->implementation() ->GpuStreamMemberHack())); calib_res->calibrator_->setBatch(input_data, *stream); + test::AddTestValue(StrCat(name(), ":ExecuteCalibration"), "done"); VLOG(2) << "Passed calibration data"; ExecuteNativeSegment(ctx, helper); } @@ -258,7 +256,7 @@ int TRTEngineOp::GetEngineBatch(OpKernelContext* ctx) { StrCat("Engine buffer is full. buffer limit=", max_cached_engines_, ", current entries="); for (auto i : cached_engine_batches_) StrAppend(&msg, i, ","); - StrAppend(&msg, "Requested batch=", num_batch); + StrAppend(&msg, " requested batch=", num_batch); LOG(WARNING) << msg; return -1; } @@ -276,7 +274,8 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, } const int smallest_engine = GetEngineBatch(ctx); if (smallest_engine < 0) { - LOG(WARNING) << "Failed to get engine batch, running native segment"; + LOG(WARNING) << "Failed to get engine batch, running native segment for " + << name(); ExecuteNativeSegment(ctx, helper); return; } @@ -286,14 +285,15 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, auto& trt_engine_ptr = engine_ctx_pair.first; if (!trt_engine_ptr) { LOG(WARNING) << "Engine retrieval for batch size " << num_batch - << " failed. Running native segment"; + << " failed. Running native segment for " << name(); ExecuteNativeSegment(ctx, helper); return; } const bool retry = ExecuteTrtEngine(ctx, num_batch, trt_engine_ptr.get(), engine_ctx_pair.second.get()); if (retry) { - LOG(WARNING) << "Failed to execute engine, retrying with native segment"; + LOG(WARNING) << "Failed to execute engine, " + << "retrying with native segment for " << name(); ExecuteNativeSegment(ctx, helper); return; } @@ -412,6 +412,7 @@ bool TRTEngineOp::ExecuteTrtEngine( LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name(); return kRetry; } + test::AddTestValue(StrCat(name(), ":ExecuteTrtEngine"), "done"); // Synchronization will be done by TF. return !kRetry; } @@ -589,7 +590,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( // TODO(aaroey): maybe setting the max batch size using the python // calibration wrapper class. auto s = convert::ConvertGraphDefToEngine( - *segment_graph, convert::INT8MODE, cres->calibrator_->getBatchSize(), + *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(), cres->calibrator_.get(), &cres->engine_, /*convert_successfully=*/nullptr); diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index 59b744e6d35d603795c0e87c89c0a8d56c26b3cb..8fe06758914261035c90a6fda3f114a63a8ac93a 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -35,7 +35,7 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -class TRTInt8Calibrator; +struct TRTInt8Calibrator; class TRTCalibrationResource; class AsyncHelper; // TODO(Sami): Remove this file? diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py index fe4fa166a10d914d028938925266683e62861421..7cdfe2b1a612be2eec473d806d0eb44b611ca68a 100644 --- a/tensorflow/contrib/tensorrt/python/__init__.py +++ b/tensorflow/contrib/tensorrt/python/__init__.py @@ -20,7 +20,11 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long from tensorflow.contrib.tensorrt.python.ops import trt_engine_op +from tensorflow.contrib.tensorrt.python.trt_convert import add_test_value from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph +from tensorflow.contrib.tensorrt.python.trt_convert import clear_test_values from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph +from tensorflow.contrib.tensorrt.python.trt_convert import enable_test_value +from tensorflow.contrib.tensorrt.python.trt_convert import get_test_value from tensorflow.contrib.tensorrt.python.trt_convert import is_tensorrt_enabled # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index 2b67931661397cee0de9faa66b58a608c69ecdc5..4116f2fe30aa5c0c9ea139100291abe3b13da94b 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -20,26 +20,26 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long import six as _six +from tensorflow.contrib.tensorrt.wrap_conversion import add_test_value from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert +from tensorflow.contrib.tensorrt.wrap_conversion import clear_test_values +from tensorflow.contrib.tensorrt.wrap_conversion import enable_test_value from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version +from tensorflow.contrib.tensorrt.wrap_conversion import get_test_value from tensorflow.contrib.tensorrt.wrap_conversion import is_tensorrt_enabled -from tensorflow.contrib.tensorrt.wrap_conversion import trt_convert from tensorflow.core.framework import graph_pb2 +from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 -from tensorflow.python.framework import errors from tensorflow.python.framework import errors_impl as _impl -from tensorflow.python.framework import meta_graph +from tensorflow.python.framework import importer from tensorflow.python.framework import ops from tensorflow.python.grappler import tf_optimizer from tensorflow.python.platform import tf_logging -from tensorflow.python.util import compat - +from tensorflow.python.training import saver # pylint: enable=unused-import,line-too-long -# TODO(skama): get outputs from session when implemented as c++ -# optimization pass def create_inference_graph(input_graph_def, outputs, max_batch_size=1, @@ -48,7 +48,7 @@ def create_inference_graph(input_graph_def, minimum_segment_size=3, is_dynamic_op=False, maximum_cached_engines=1, - cached_engine_batches=[]): + cached_engine_batches=None): """Python wrapper for the TRT transformation. Args: @@ -87,8 +87,7 @@ def create_inference_graph(input_graph_def, (".".join([str(x) for x in compiled_version]), ".".join([str(x) for x in loaded_version])) + ". Please make sure that correct version of TensorRT " + - "is available in the system and added to ldconfig or LD_LIBRARY_PATH" - ) + "is available in the system and added to ldconfig or LD_LIBRARY_PATH") raise RuntimeError("Incompatible TensorRT library version") for i in zip(loaded_version, compiled_version): if i[0] != i[1]: @@ -121,41 +120,42 @@ def create_inference_graph(input_graph_def, to_bytes = py3bytes to_string = py3string - out_names = [] - for i in outputs: - if isinstance(i, ops.Tensor): - out_names.append(to_bytes(i.name)) - else: - out_names.append(to_bytes(i)) - - input_graph_def_str = input_graph_def.SerializeToString() - - # TODO(sami): Fix this when we can return status from C++ library - # There is a problem with the TF internal library setup that doesn't - # allow us to return a status object from C++. Thus we return a - # pair or strings where first one is encoded status and the second - # one is the transformed graphs protobuf string. - out = trt_convert(input_graph_def_str, out_names, max_batch_size, - max_workspace_size_bytes, mode, minimum_segment_size, - is_dynamic_op, maximum_cached_engines, - cached_engine_batches) - status = to_string(out[0]) - output_graph_def_string = out[1] - del input_graph_def_str # Save some memory - if len(status) < 2: - raise _impl.UnknownError(None, None, status) - if status[:2] != "OK": - msg = status.split(";") - if len(msg) == 1: - raise RuntimeError("Status message is malformed {}".format(status)) - # pylint: disable=protected-access - raise _impl._make_specific_exception(None, None, ";".join(msg[1:]), - int(msg[0])) - # pylint: enable=protected-access - output_graph_def = graph_pb2.GraphDef() - output_graph_def.ParseFromString(output_graph_def_string) - del output_graph_def_string # Save some memory - return output_graph_def + # Create MetaGraphDef + graph = ops.Graph() + with graph.as_default(): + importer.import_graph_def(input_graph_def, name="") + meta_graph = saver.export_meta_graph( + graph_def=graph.as_graph_def(), graph=graph) + if outputs: + output_collection = meta_graph_pb2.CollectionDef() + output_list = output_collection.node_list.value + for i in outputs: + if isinstance(i, ops.Tensor): + output_list.append(to_bytes(i.name)) + else: + output_list.append(to_bytes(i)) + meta_graph.collection_def["train_op"].CopyFrom(output_collection) + + # Create RewriterConfig. + rewriter_cfg = rewriter_config_pb2.RewriterConfig() + rewriter_cfg.optimizers.extend(["constfold", "layout"]) + optimizer = rewriter_cfg.custom_optimizers.add() + optimizer.name = "TensorRTOptimizer" + optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size + optimizer.parameter_map["max_batch_size"].i = max_batch_size + optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op + optimizer.parameter_map[ + "max_workspace_size_bytes"].i = max_workspace_size_bytes + optimizer.parameter_map["precision_mode"].s = to_bytes(precision_mode) + optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines + if cached_engine_batches: + if not isinstance(cached_engine_batches, list): + raise TypeError("cached_engine_batches should be a list.") + optimizer.parameter_map["cached_engine_batches"].list.i.extend( + cached_engine_batches) + + return tf_optimizer.OptimizeGraph( + rewriter_cfg, meta_graph, graph_id=b"tf_graph") def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False): diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc index 008fffc95430b1c423788a4e958e06e700cac233..b43f1b190f5f8cfe98959dd9f2838e4d45759e5c 100644 --- a/tensorflow/contrib/tensorrt/segment/segment.cc +++ b/tensorflow/contrib/tensorrt/segment/segment.cc @@ -414,10 +414,10 @@ tensorflow::Status SegmentGraph( } for (const SimpleNode* node : order) { // All output nodes of 'node' have been visited... - VLOG(2) << "Trying node " << node->name() << " id=" << node->id(); + VLOG(3) << "Trying node " << node->name() << " id=" << node->id(); // 'node' must be a TRT candidate... if (node_segments[node->id()].Value() == nullptr) { - VLOG(2) << "... not a TRT candidate"; + VLOG(3) << "... not a TRT candidate"; continue; } // Contract output edges to combine 'node' with output @@ -426,22 +426,22 @@ tensorflow::Status SegmentGraph( while (true) { std::set contract_edges; for (const SimpleEdge* out_edge : node->out_edges()) { - VLOG(2) << "... out node " << out_edge->dst()->name() << " ( " + VLOG(3) << "... out node " << out_edge->dst()->name() << " ( " << out_edge->dst()->id() << " <- " << node->id() << " )"; if (out_edge->IsControlEdge()) { - VLOG(2) << "... ... Control Edge, Skipping"; + VLOG(3) << "... ... Control Edge, Skipping"; continue; } // Out node must be TRT candidate... if (node_segments[out_edge->dst()->id()].Value() == nullptr) { - VLOG(2) << "... ... not a TRT candidate"; + VLOG(3) << "... ... not a TRT candidate"; continue; } if (CanContractEdge(out_edge, graph)) { - VLOG(2) << "... ... can contract"; + VLOG(3) << "... ... can contract"; contract_edges.insert(out_edge); } else { - VLOG(2) << "... ... cannot contract, would form cycle"; + VLOG(3) << "... ... cannot contract, would form cycle"; } } if (contract_edges.empty()) { @@ -454,7 +454,7 @@ tensorflow::Status SegmentGraph( const SimpleNode* src = contract_edge->src(); const SimpleNode* dst = contract_edge->dst(); - VLOG(2) << "Merge " << src->name() << " <- " << dst->name() << " (" + VLOG(3) << "Merge " << src->name() << " <- " << dst->name() << " (" << src->id() << " <- " << dst->id(); node_segments[src->id()].Merge(&node_segments[dst->id()]); @@ -478,7 +478,7 @@ tensorflow::Status SegmentGraph( // A map from the segment identifier (currently the name of the root node of // the segment tree) to the segment nodes set. - std::unordered_map> sg_map; + std::map> sg_map; // A map from the segment identifier (currently the name of the root node of // the segment tree) to the device names that the nodes in the segment are @@ -558,27 +558,36 @@ tensorflow::Status SegmentGraph( // then after doing this operation the resulting subgraph will keep the // same properties 1 and 2. // - // For simplicity we use heuristics: for input nodes remove all its - // input, for output nodes remove all its output. In this way, for common - // cases the number of removed nodes should be minimum. + // For simplicity we use heuristics: for input and const output nodes + // remove all their inputs, and for non-const output nodes remove all + // their outputs. In this way, for common cases the number of removed + // nodes should be minimum. auto remove_nodes = [&segment_nodes]( bool is_input_nodes, std::deque* que) { // Run a BFS on the queue to find all the input/output nodes. std::set visited; + std::set logged(que->begin(), que->end()); while (!que->empty()) { auto node = que->front(); que->pop_front(); if (!visited.insert(node).second) continue; segment_nodes.erase(node); - for (auto in : - is_input_nodes ? node->in_nodes() : node->out_nodes()) { + for (auto in : (is_input_nodes || node->type_string() == "Const") + ? node->in_nodes() + : node->out_nodes()) { if (segment_nodes.count(in)) { que->push_back(in); - VLOG(2) << "Need to remove node " << in->name() - << " because one of its " - << (is_input_nodes ? "output" : "input") - << " nodes in the graph was removed: " << node->name(); + if (VLOG_IS_ON(2)) { + if (!logged.count(in)) { + VLOG(2) << "----> Need to remove node " << in->name() + << " because one of its " + << (is_input_nodes ? "output" : "input") + << " nodes in the graph was removed: " + << node->name(); + logged.insert(in); + } + } } } } @@ -594,7 +603,7 @@ tensorflow::Status SegmentGraph( for (const auto& itr : sg_map) { const std::set& segment_nodes = itr.second; if (VLOG_IS_ON(1)) { - string s; + string s = "parent=" + itr.first + ":"; for (auto node : segment_nodes) s += " " + node->name(); VLOG(1) << "Segment " << segments->size() << ": " << s; } diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc index 432e7b1c047cb3b22d47f7432b6aad639a3a3b2d..5937fa8259a39339e92b150862d195ee1f23f70a 100644 --- a/tensorflow/contrib/tensorrt/segment/segment_test.cc +++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc @@ -206,7 +206,7 @@ TEST_F(SegmentTest, Multiple) { // Make add5 not a TRT candidate, and we expect two segments. auto without_add5 = all_adds - "add5"; RunTest(&g, without_add5, without_add5, without_add5, - {{"add6", "add8"}, {"add0", "add1", "add2", "add3"}}); + {{"add0", "add1", "add2", "add3"}, {"add6", "add8"}}); // Make add8 not a candidate and add6 not an input candidate, then all direct // and indirect inputs of add6 will be removed from the segment. @@ -252,7 +252,7 @@ TEST_F(SegmentTest, BigIfElse) { const std::set all_adds = {"add0", "add1", "add2", "add3", "add4", "add5", "add6", "add7"}; RunTest(&g, all_adds - "add2", all_adds, all_adds, - {{"add3", "add4", "add5", "add6", "add7"}, {"add0", "add1"}}); + {{"add0", "add1"}, {"add3", "add4", "add5", "add6", "add7"}}); } } // namespace test diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py index edd30ad7a95dd3c7f74634699660caad30c0b645..8ea5a6373525a8045d13f70aa9e12d66d4c08f0a 100644 --- a/tensorflow/contrib/tensorrt/test/base_test.py +++ b/tensorflow/contrib/tensorrt/test/base_test.py @@ -20,17 +20,19 @@ from __future__ import print_function import numpy as np +from tensorflow.contrib.tensorrt.python import trt_convert from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test -class SimpleSingleEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase): +class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase): def GetParams(self): """Create a graph containing single segment.""" @@ -65,13 +67,17 @@ class SimpleSingleEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=1, + # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which + # breaks the connection check, fix it. + # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add", + # "relu", "identity", "max_pool"] + expected_engines=["my_trt_op_0"], expected_output_dims=(100, 6, 6, 6), allclose_atol=1.e-03, allclose_rtol=1.e-03) -class SimpleMultiEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase): +class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): def GetParams(self): """Create a graph containing multiple segment.""" @@ -95,32 +101,246 @@ class SimpleMultiEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase): padding="SAME", name="conv") c1 = constant_op.constant( - np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype) - p = conv * c1 + np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c1") + p = math_ops.mul(conv, c1, name="mul") c2 = constant_op.constant( - np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype) - q = conv / c2 + np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c2") + q = math_ops.div(conv, c2, name="div") - edge = self.trt_incompatible_op(q) - edge /= edge - r = edge + edge + edge = self.trt_incompatible_op(q, name="incompatible") + edge = math_ops.div(edge, edge, name="div1") + r = math_ops.add(edge, edge, name="add") - p -= edge - q *= edge - s = p + q - s -= r + p = math_ops.sub(p, edge, name="sub") + q = math_ops.mul(q, edge, name="mul1") + s = math_ops.add(p, q, name="add1") + s = math_ops.sub(s, r, name="sub1") array_ops.squeeze(s, name=self.output_name) return trt_test.TfTrtIntegrationTestParams( gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=2, + # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which + # breaks the connection check, fix it. + # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1", + # "add", "sub1"]; + # - my_trt_op_1 should have ["weights","conv", "div"] + expected_engines=["my_trt_op_0", "my_trt_op_1"], expected_output_dims=(100, 12, 12, 6), allclose_atol=1.e-03, allclose_rtol=1.e-03) -# TODO(aaroey): add a large complex graph to test. +class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase): + + def setUp(self): + """Setup method.""" + super(PartiallyConvertedTestA, self).setUp() + # Let it fail to build the second engine. + trt_convert.add_test_value("my_trt_op_1:CreateTRTNode", "fail") + + def GetParams(self): + """Create a graph containing two segment.""" + input_name = "input" + input_dims = [2, 32, 32, 3] + g = ops.Graph() + with g.as_default(): + inp = array_ops.placeholder( + dtype=dtypes.float32, shape=input_dims, name=input_name) + with g.device("/GPU:0"): + n = inp + for i in range(2): + c = constant_op.constant(1.0, name="c%d" % i) + n = math_ops.add(n, c, name="add%d" % i) + n = math_ops.mul(n, n, name="mul%d" % i) + edge = self.trt_incompatible_op(n, name="incompatible") + with g.control_dependencies([edge]): + c = constant_op.constant(1.0, name="c2") + n = math_ops.add(n, c, name="add2") + n = math_ops.mul(n, n, name="mul2") + c = constant_op.constant(1.0, name="c3") + n = math_ops.add(n, c, name="add3") + n = math_ops.mul(n, n, name="mul3") + array_ops.squeeze(n, name=self.output_name) + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + expected_engines={ + # Only the first engine is built. + "my_trt_op_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"] + }, + expected_output_dims=tuple(input_dims), + allclose_atol=1.e-06, + allclose_rtol=1.e-06) + + +class PartiallyConvertedTestB(PartiallyConvertedTestA): + + def setUp(self): + """Setup method.""" + super(PartiallyConvertedTestB, self).setUp() + # Let it fail to build the first engine. + trt_convert.clear_test_values("") + trt_convert.add_test_value("my_trt_op_0:CreateTRTNode", "fail") + + def GetParams(self): + """Create a graph containing two segment.""" + return super(PartiallyConvertedTestB, self).GetParams()._replace( + expected_engines={ + # Only the second engine is built. + "my_trt_op_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"] + }) + + +class ConstInputTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing multiple segment.""" + input_name = "input" + input_dims = [2, 32, 32, 3] + g = ops.Graph() + with g.as_default(): + inp = array_ops.placeholder( + dtype=dtypes.float32, shape=input_dims, name=input_name) + with g.device("/GPU:0"): + n = inp + c = constant_op.constant(1.0, name="c") + # Adds control dependency from the constant op to a trt incompatible op, + # and adds control dependency from the trt incompatible op to all other + # ops, to make sure the constant op cannot be contracted with any trt + # segment that depends on it. + with g.control_dependencies([c]): + d = self.trt_incompatible_op(n, name="incompatible") + with g.control_dependencies([d]): + n = math_ops.add(n, c, name="add") + n = math_ops.mul(n, n, name="mul") + n = math_ops.add(n, n, name="add1") + n = self.trt_incompatible_op(n, name="incompatible1") + with g.control_dependencies([d]): + n = math_ops.add(n, c, name="add2") + n = math_ops.mul(n, n, name="mul1") + n = math_ops.add(n, n, name="add3") + array_ops.squeeze(n, name=self.output_name) + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + expected_engines={ + "my_trt_op_0": ["add", "add1", "mul"], + "my_trt_op_1": ["add2", "add3", "mul1"] + }, + expected_output_dims=tuple(input_dims), + allclose_atol=1.e-06, + allclose_rtol=1.e-06) + + +class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment.""" + input_name = "input" + input_dims = [2, 32, 32, 3] + g = ops.Graph() + with g.as_default(): + inp = array_ops.placeholder( + dtype=dtypes.float32, shape=input_dims, name=input_name) + with g.device("/GPU:0"): + n = inp + c = constant_op.constant(1.0, name="c") + n = math_ops.add(n, c, name="add") + n = math_ops.mul(n, n, name="mul") + n = math_ops.add(n, n, name="add1") + array_ops.squeeze(n, name=self.output_name) + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + expected_engines={"my_trt_op_0": ["c", "add", "add1", "mul"]}, + expected_output_dims=tuple(input_dims), + allclose_atol=1.e-06, + allclose_rtol=1.e-06) + + +class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing multiple segment.""" + input_name = "input" + input_dims = [2, 32, 32, 3] + g = ops.Graph() + with g.as_default(): + inp = array_ops.placeholder( + dtype=dtypes.float32, shape=input_dims, name=input_name) + with g.device("/GPU:0"): + n = inp + c = constant_op.constant(1.0, name="c") + n = math_ops.add(n, c, name="add") + n = math_ops.mul(n, n, name="mul") + n = math_ops.add(n, n, name="add1") + n = self.trt_incompatible_op(n, name="incompatible1") + n = math_ops.add(n, c, name="add2") + n = math_ops.mul(n, n, name="mul1") + n = math_ops.add(n, n, name="add3") + array_ops.squeeze(n, name=self.output_name) + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + expected_engines={ + "my_trt_op_0": ["add2", "add3", "mul1"], + # Why segment ["add", "add1", "mul"] was assigned segment id 1 + # instead of 0: the parent node of this segment is actually const + # node 'c', but it's removed later since it's const output of the + # segment which is not allowed. + "my_trt_op_1": ["add", "add1", "mul"] + }, + expected_output_dims=tuple(input_dims), + allclose_atol=1.e-06, + allclose_rtol=1.e-06) + + +class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing multiple segment.""" + input_name = "input" + input_dims = [2, 32, 32, 3] + g = ops.Graph() + with g.as_default(): + inp = array_ops.placeholder( + dtype=dtypes.float32, shape=input_dims, name=input_name) + with g.device("/GPU:0"): + c1 = constant_op.constant(1.0, name="c1") + c2 = constant_op.constant(1.0, name="c2") + d1 = constant_op.constant(1.0, name="d1") + d2 = self.trt_incompatible_op(inp, name="d2") + with g.control_dependencies([d1, d2]): + add = math_ops.add(inp, c1, name="add") + with g.control_dependencies([d1, d2]): + mul = math_ops.mul(add, add, name="mul") + with g.control_dependencies([d1, d2]): + add1 = math_ops.add(mul, mul, name="add1") + edge = self.trt_incompatible_op(add1, name="incompatible") + with g.control_dependencies([d1, d2, add, mul]): + add2 = math_ops.add(edge, c2, name="add2") + with g.control_dependencies([d1, d2, add1, mul]): + mul1 = math_ops.mul(add2, add2, name="mul1") + with g.control_dependencies([d1, d2, add, add1]): + add3 = math_ops.add(mul1, mul1, name="add3") + array_ops.squeeze(add3, name=self.output_name) + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + expected_engines={ + "my_trt_op_0": ["c1", "add", "add1", "mul"], + "my_trt_op_1": ["c2", "add2", "add3", "mul1"] + }, + expected_output_dims=tuple(input_dims), + allclose_atol=1.e-06, + allclose_rtol=1.e-06) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py index 730b6843fb9885b8ba0db2ad199b95d9d3219774..2e1107e30383926f6428c6551682caf66cd97498 100644 --- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py +++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py @@ -66,7 +66,7 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name, w1_name, w2_name], input_dims=[input_dims, w1_dims, w2_dims], - num_expected_engines=1, + expected_engines=["my_trt_op_0"], expected_output_dims=(12, 5, 8, 7), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py index 0c03a10b640c8b243318bb4327d2ac5aac803be7..8be32f59b48e64412466370950298feafc03b35c 100644 --- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py +++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py @@ -102,7 +102,10 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=7, + expected_engines=[ + "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3", + "my_trt_op_4", "my_trt_op_5", "my_trt_op_6" + ], expected_output_dims=(48, 89), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py index dd673463a5930df4d0e4c1c7410b3f5eb88d664c..9316b14da07d5f7e47953504680e14d5d20c17a4 100644 --- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py +++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py @@ -109,7 +109,24 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=16, + expected_engines=[ + "my_trt_op_0", + "my_trt_op_1", + "my_trt_op_2", + "my_trt_op_3", + "my_trt_op_4", + "my_trt_op_5", + "my_trt_op_6", + "my_trt_op_7", + "my_trt_op_8", + "my_trt_op_9", + "my_trt_op_10", + "my_trt_op_11", + "my_trt_op_12", + "my_trt_op_13", + "my_trt_op_14", + "my_trt_op_15", + ], expected_output_dims=(5, 23040), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py index 8c51c45b0a2c6f370415b9c8ac99a63dd37be900..1874b9dd45390407d3d36798cae620848df50c8d 100644 --- a/tensorflow/contrib/tensorrt/test/concatenation_test.py +++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py @@ -73,7 +73,7 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=1, + expected_engines=["my_trt_op_0"], expected_output_dims=(2, 126), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py index 97b29bf05ddc3a0396472d0500ff53ceca7c5d4b..8c59000b70e04cedc84308249865cfcb23ce80a3 100644 --- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py +++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py @@ -58,7 +58,7 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=1, + expected_engines=['my_trt_op_0'], expected_output_dims=(5, 12, 12, 1), allclose_atol=1.e-02, allclose_rtol=1.e-02) diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py index 3dd95c6f62fe504cb23e01fdb8b9785cee080de4..66eb6be757d3f4dcc390435486f7ed4f6517f875 100644 --- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py +++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py @@ -62,7 +62,7 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=1, + expected_engines=["my_trt_op_0"], expected_output_dims=(2, 15, 15, 10), allclose_atol=1.e-02, allclose_rtol=1.e-02) diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py index 734ccf6345777d543138daba2b720c9dc03f3295..fd55b8cd99171fe34424e48a417eb8981b051c17 100644 --- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py +++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py @@ -77,7 +77,7 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=2, + expected_engines=["my_trt_op_0", "my_trt_op_1"], expected_output_dims=(2, 4, 5, 4), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py index 50265c0845005748d75bf8afc49df11a528c9169..51c905a50b29c017719d66f9049e9b1bc3a9ec97 100644 --- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py +++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py @@ -25,7 +25,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.platform import test @@ -51,15 +51,18 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase): name="conv") b = constant_op.constant( np.random.normal(1.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype) - t = conv * b - e = gen_math_ops.tan(conv) - t = t - e + t = math_ops.mul(conv, b, name="mul") + e = self.trt_incompatible_op(conv, name="incompatible") + t = math_ops.sub(t, e, name="sub") array_ops.squeeze(t, name=self.output_name) return trt_test.TfTrtIntegrationTestParams( gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=2, + expected_engines={ + "my_trt_op_0": ["bias", "mul", "sub"], + "my_trt_op_1": ["weights", "conv"] + }, expected_output_dims=(2, 4, 5, 4), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py index bb7f5a77f011ee5c4fe748c246ac632a7bb19aff..6f85ada4649563d099c6054e8e17da27954071f7 100644 --- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py @@ -20,6 +20,7 @@ from __future__ import print_function from collections import namedtuple import itertools +import os import warnings import numpy as np import six @@ -30,6 +31,7 @@ from tensorflow.contrib.tensorrt.python.ops import trt_engine_op # pylint: enable=unused-import from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.framework import graph_io from tensorflow.python.framework import importer from tensorflow.python.framework import ops from tensorflow.python.framework import test_util @@ -37,10 +39,14 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [ - "gdef", "input_names", "input_dims", "num_expected_engines", + "gdef", "input_names", "input_dims", "expected_engines", "expected_output_dims", "allclose_atol", "allclose_rtol" ]) +RunParams = namedtuple( + "RunParams", + ["use_optimizer", "precision_mode", "dynamic_engine", "test_name"]) + PRECISION_MODES = ["FP32", "FP16", "INT8"] @@ -48,6 +54,12 @@ def _IsQuantizationMode(mode): return mode == "INT8" +class GraphState(object): + ORIGINAL = 0 + CALIBRATE = 1 + INFERENCE = 2 + + class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): """Class to test Tensorflow-TensorRT integration.""" @@ -63,45 +75,90 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): def precision_modes(self): return ["FP32", "FP16", "INT8"] + # str is bytes in py2, but unicode in py3. + def _ToUnicode(self, s): + if six.PY2: + if isinstance(s, unicode): + return s + return s.decode("utf-8") + else: + if isinstance(s, str): + return s + return s.decode("utf-8") + def _ToBytes(self, s): if six.PY2: + if isinstance(s, unicode): + return s.encode("utf-8") return s else: - return s.encode("utf-8") + if isinstance(s, str): + return s.encode("utf-8") + return s def _ToString(self, s): if six.PY2: + if isinstance(s, unicode): + return s.encode("utf-8") return s else: + if isinstance(s, str): + return s return s.decode("utf-8") + @classmethod + def setUpClass(cls): + """Setup method for the module.""" + super(TfTrtIntegrationTestBase, cls).setUpClass() + trt_convert.enable_test_value() + def setUp(self): """Setup method.""" super(TfTrtIntegrationTestBase, self).setUp() warnings.simplefilter("always") + trt_convert.clear_test_values("") def GetParams(self): """Return a TfTrtIntegrationTestParams for test, implemented by subclass.""" raise NotImplementedError() - def _GetConfigProto(self, - params, - use_optimizer, - precision_mode=None, - is_dynamic_op=None): + def _PrepareRun(self, params, graph_state): + """Set up necessary testing environment before calling sess.run().""" + # Clear test values added by TRTEngineOp. + trt_convert.clear_test_values("my_trt_op_.*:ExecuteTrtEngine") + trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration") + trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment") + + def _VerifyRun(self, params, graph_state): + """Verify the state after sess.run().""" + for engine_name in params.expected_engines: + if graph_state == GraphState.ORIGINAL: + self._ExpectCalibration(engine_name, "") + self._ExpectNativeSegment(engine_name, "") + self._ExpectTrtEngine(engine_name, "") + elif graph_state == GraphState.CALIBRATE: + self._ExpectCalibration(engine_name, "done") + self._ExpectNativeSegment(engine_name, "done") + self._ExpectTrtEngine(engine_name, "") + elif graph_state == GraphState.INFERENCE: + self._ExpectCalibration(engine_name, "") + self._ExpectNativeSegment(engine_name, "") + self._ExpectTrtEngine(engine_name, "done") + + def _GetConfigProto(self, params, run_params, graph_state): """Get config proto based on specific settings.""" - if use_optimizer: + if graph_state != GraphState.ORIGINAL and run_params.use_optimizer: rewriter_cfg = rewriter_config_pb2.RewriterConfig() rewriter_cfg.optimizers.extend(["constfold", "layout"]) custom_op = rewriter_cfg.custom_optimizers.add() custom_op.name = "TensorRTOptimizer" - custom_op.parameter_map["minimum_segment_size"].i = 3 + custom_op.parameter_map["minimum_segment_size"].i = 2 custom_op.parameter_map["max_batch_size"].i = max( [dims[0] for dims in params.input_dims]) - custom_op.parameter_map["is_dynamic_op"].b = is_dynamic_op + custom_op.parameter_map["is_dynamic_op"].b = run_params.dynamic_engine custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25 custom_op.parameter_map["precision_mode"].s = self._ToBytes( - precision_mode) + run_params.precision_mode) graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg) else: graph_options = config_pb2.GraphOptions() @@ -115,7 +172,26 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): gpu_options=gpu_options, graph_options=graph_options) return config - def _RunGraph(self, params, gdef, input_data, config, num_runs=2): + def _ExpectTestValue(self, engine_name, method, expected_value): + label = "%s:%s" % (engine_name, method) + actual_value = trt_convert.get_test_value(label) + self.assertEqual( + expected_value, + actual_value, + msg="Unexpected test value with label %s. Actual: %s; expected: %s" % + (label, actual_value, expected_value)) + + def _ExpectCalibration(self, engine_name, value): + self._ExpectTestValue(engine_name, "ExecuteCalibration", value) + + def _ExpectTrtEngine(self, engine_name, value): + self._ExpectTestValue(engine_name, "ExecuteTrtEngine", value) + + def _ExpectNativeSegment(self, engine_name, value): + self._ExpectTestValue(engine_name, "ExecuteNativeSegment", value) + + def _RunGraph(self, params, gdef, input_data, config, graph_state, + num_runs=2): """Run given graphdef multiple times.""" assert len(params.input_names) == len(input_data) g = ops.Graph() @@ -132,93 +208,170 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): val = None # Defaults to 2 runs to verify result across multiple runs is same. for _ in range(num_runs): + self._PrepareRun(params, graph_state) new_val = sess.run(out, {inp[i]: input_data[i] for i in range(len(inp))}) self.assertEqual(params.expected_output_dims, new_val.shape) if val is not None: self.assertAllEqual(val, new_val) val = new_val + self._VerifyRun(params, graph_state) return val # Use real data that is representative of the inference dataset # for calibration. For this test script it is random data. def _RunCalibration(self, params, gdef, input_data, config): """Run calibration on given graph.""" - return self._RunGraph(params, gdef, input_data, config, 30) + return self._RunGraph( + params, gdef, input_data, config, GraphState.CALIBRATE, num_runs=5) - def _GetTrtGraphDef(self, params, gdef, precision_mode, is_dynamic_op): + def _GetTrtGraphDef(self, params, run_params, gdef): """Return trt converted graphdef.""" return trt_convert.create_inference_graph( input_graph_def=gdef, outputs=[self.output_name], max_batch_size=max([dims[0] for dims in params.input_dims]), max_workspace_size_bytes=1 << 25, - precision_mode=precision_mode, + precision_mode=run_params.precision_mode, minimum_segment_size=2, - is_dynamic_op=is_dynamic_op) - - def _VerifyGraphDef(self, - params, - gdef, - precision_mode=None, - is_calibrated=None, - dynamic_engine=None): + is_dynamic_op=run_params.dynamic_engine) + + def _WriteGraph(self, params, run_params, gdef, graph_state): + if graph_state == GraphState.ORIGINAL: + label = "Original" + elif graph_state == GraphState.CALIBRATE: + label = "CalibEngine" + elif graph_state == GraphState.INFERENCE: + label = "InferEngine" + graph_name = ( + self.__class__.__name__ + "_" + run_params.test_name + "_" + label + + ".pbtxt") + temp_dir = os.getenv("TRT_TEST_TMPDIR", self.get_temp_dir()) + logging.info("Writing graph to %s/%s", temp_dir, graph_name) + graph_io.write_graph(gdef, temp_dir, graph_name) + + def _VerifyConnections(self, params, converted_gdef): + old_to_new_node_map = { + self._ToString(node.name): self._ToString(node.name) + for node in params.gdef.node + } + for engine_name, node_names in params.expected_engines.items(): + for node_name in node_names: + old_to_new_node_map[node_name] = engine_name + name_to_node_map = { + self._ToString(node.name): node for node in params.gdef.node + } + + def _InputName(inp): + inp = self._ToString(inp) + prefix = "" + if inp[0] == "^": + prefix = "^" + inp = inp[1:] + parts = inp.split(":") + if len(parts) > 1 and parts[-1].isdigit(): + inp = inp[:-len(parts[-1]) - 1] + return (prefix, inp) + + expected_input_map = {} + for node in params.gdef.node: + name_str = self._ToString(node.name) + target_node_name = old_to_new_node_map[name_str] + is_engine_op = (target_node_name != name_str) + if target_node_name not in expected_input_map: + expected_input_map[target_node_name] = set() + input_set = expected_input_map[target_node_name] + for inp in node.input: + (prefix, inp_name) = _InputName(inp) + # Add the input only if it's outside the segment (note that it could be + # in a different engine). + if (not is_engine_op or + old_to_new_node_map[inp_name] != target_node_name): + if is_engine_op and name_to_node_map[inp_name].op == "Const": + # Const data input nodes to the segment has been copied to the + # segment graphdef and the engine, and the dependency has been + # converted to control dependendy. + input_set.add("^" + old_to_new_node_map[inp_name]) + else: + input_set.add(prefix + old_to_new_node_map[inp_name]) + + actual_input_map = {} + for node in converted_gdef.node: + name_str = self._ToString(node.name) + actual_input_map[name_str] = set() + input_set = actual_input_map[name_str] + for inp in node.input: + (prefix, node_name) = _InputName(inp) + input_set.add(prefix + node_name) + + self.assertEqual( + expected_input_map, + actual_input_map, + msg="expected:\n%s\nvs actual:\n%s" % (sorted( + expected_input_map.items()), sorted(actual_input_map.items()))) + + def _VerifyGraphDef(self, params, run_params, gdef, graph_state): + self._WriteGraph(params, run_params, gdef, graph_state) + num_engines = 0 - for n in gdef.node: - # TODO(jie): we should have coverage for failed conversion (TF fallback). - # where the conversion will fail and we shouldn't count this engine as the - # converted engines. - if n.op == "TRTEngineOp": + for node in gdef.node: + if node.op == "TRTEngineOp": num_engines += 1 - self.assertNotEqual(self._ToBytes(""), n.attr["serialized_segment"].s) - self.assertNotEqual(self._ToBytes(""), n.attr["segment_funcdef_name"].s) + self.assertTrue(node.name in params.expected_engines) + self.assertTrue(len(node.attr["serialized_segment"].s)) + self.assertTrue(len(node.attr["segment_funcdef_name"].s)) self.assertEqual( - self._ToBytes(precision_mode), n.attr["precision_mode"].s) - self.assertEqual(not dynamic_engine, n.attr["static_engine"].b) - if _IsQuantizationMode(precision_mode) and is_calibrated: - self.assertNotEqual(self._ToBytes(""), n.attr["calibration_data"].s) + self._ToBytes(run_params.precision_mode), + node.attr["precision_mode"].s) + + is_dynamic_engine = not node.attr["static_engine"].b + self.assertEqual(run_params.dynamic_engine, is_dynamic_engine) + + has_calibration_data = len(node.attr["calibration_data"].s) + if (_IsQuantizationMode(run_params.precision_mode) and + graph_state == GraphState.INFERENCE): + self.assertTrue(has_calibration_data) else: - self.assertEqual(self._ToBytes(""), n.attr["calibration_data"].s) - if precision_mode is None: # This means gdef is the original GraphDef. + self.assertFalse(has_calibration_data) + if graph_state == GraphState.ORIGINAL: self.assertEqual(0, num_engines) else: - self.assertEqual(num_engines, params.num_expected_engines) + self.assertEqual(num_engines, len(params.expected_engines)) + if isinstance(params.expected_engines, dict): + self._VerifyConnections(params, gdef) + # TODO(aaroey): consider verifying the corresponding TF function. - def RunTest(self, params, use_optimizer, precision_mode, - dynamic_infer_engine, dynamic_calib_engine): - assert precision_mode in PRECISION_MODES + def RunTest(self, params, run_params): + assert run_params.precision_mode in PRECISION_MODES input_data = [np.random.random_sample(dims) for dims in params.input_dims] input_gdef = params.gdef - self._VerifyGraphDef(params, input_gdef) + self._VerifyGraphDef(params, run_params, input_gdef, GraphState.ORIGINAL) # Get reference result without running trt. - config_no_trt = self._GetConfigProto(params, False) + config_no_trt = self._GetConfigProto(params, run_params, + GraphState.ORIGINAL) logging.info("Running original graph w/o trt, config:\n%s", str(config_no_trt)) - ref_result = self._RunGraph(params, input_gdef, input_data, config_no_trt) + ref_result = self._RunGraph(params, input_gdef, input_data, config_no_trt, + GraphState.ORIGINAL) # Run calibration if necessary. - if _IsQuantizationMode(precision_mode): + if _IsQuantizationMode(run_params.precision_mode): - calib_config = self._GetConfigProto(params, use_optimizer, precision_mode, - dynamic_calib_engine) + calib_config = self._GetConfigProto(params, run_params, + GraphState.CALIBRATE) logging.info("Running calibration graph, config:\n%s", str(calib_config)) - if use_optimizer: - self.assertTrue(False) - # TODO(aaroey): uncomment this and get infer_gdef when this mode is - # supported. - # result = self._RunCalibration(params, input_gdef, input_data, - # calib_config) + if run_params.use_optimizer: + result = self._RunCalibration(params, input_gdef, input_data, + calib_config) else: - calib_gdef = self._GetTrtGraphDef(params, input_gdef, precision_mode, - dynamic_calib_engine) - self._VerifyGraphDef(params, calib_gdef, precision_mode, False, - dynamic_calib_engine) + calib_gdef = self._GetTrtGraphDef(params, run_params, input_gdef) + self._VerifyGraphDef(params, run_params, calib_gdef, + GraphState.CALIBRATE) result = self._RunCalibration(params, calib_gdef, input_data, calib_config) - infer_gdef = trt_convert.calib_graph_to_infer_graph(calib_gdef) - self._VerifyGraphDef(params, infer_gdef, precision_mode, True, - dynamic_calib_engine) + infer_gdef = trt_convert.calib_graph_to_infer_graph(calib_gdef) + self._VerifyGraphDef(params, run_params, infer_gdef, GraphState.INFERENCE) self.assertAllClose( ref_result, @@ -229,18 +382,19 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): infer_gdef = input_gdef # Run inference. - infer_config = self._GetConfigProto(params, use_optimizer, precision_mode, - dynamic_infer_engine) + infer_config = self._GetConfigProto(params, run_params, + GraphState.INFERENCE) logging.info("Running final inference graph, config:\n%s", str(infer_config)) - if use_optimizer: - result = self._RunGraph(params, infer_gdef, input_data, infer_config) + if run_params.use_optimizer: + result = self._RunGraph(params, infer_gdef, input_data, infer_config, + GraphState.INFERENCE) else: - trt_infer_gdef = self._GetTrtGraphDef(params, infer_gdef, precision_mode, - dynamic_infer_engine) - self._VerifyGraphDef(params, trt_infer_gdef, precision_mode, True, - dynamic_infer_engine) - result = self._RunGraph(params, trt_infer_gdef, input_data, infer_config) + trt_infer_gdef = self._GetTrtGraphDef(params, run_params, infer_gdef) + self._VerifyGraphDef(params, run_params, trt_infer_gdef, + GraphState.INFERENCE) + result = self._RunGraph(params, trt_infer_gdef, input_data, infer_config, + GraphState.INFERENCE) self.assertAllClose( ref_result, @@ -263,66 +417,44 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): def _AddTests(test_class): """Adds test methods to TfTrtIntegrationTestBase.""" - def _GetTest(use_optimizer, precision_mode, dynamic_infer_engine, - dynamic_calib_engine): + def _GetTest(run_params): """Gets a single test method based on the parameters.""" def _Test(self): params = self.GetParams() logging.info( - "Running test with parameters: use_optimizer=%s, precision_mode=%s, " - "dynamic_infer_engine=%s, dynamic_calib_engine=%s", use_optimizer, - precision_mode, dynamic_infer_engine, dynamic_calib_engine) - self.RunTest(params, use_optimizer, precision_mode, dynamic_infer_engine, - dynamic_calib_engine) + "Running test %s with parameters: use_optimizer=%s, " + "precision_mode=%s, dynamic_engine=%s", + "testTfTrt_" + run_params.test_name, run_params.use_optimizer, + run_params.precision_mode, run_params.dynamic_engine) + self.RunTest(params, run_params) return _Test use_optimizer_options = [False, True] - dynamic_infer_engine_options = [False, True] - dynamic_calib_engine_options = [False, True] - for (use_optimizer, precision_mode, - dynamic_infer_engine, dynamic_calib_engine) in itertools.product( - use_optimizer_options, PRECISION_MODES, dynamic_infer_engine_options, - dynamic_calib_engine_options): + dynamic_engine_options = [False, True] + for (use_optimizer, precision_mode, dynamic_engine) in itertools.product( + use_optimizer_options, PRECISION_MODES, dynamic_engine_options): if _IsQuantizationMode(precision_mode): - if not dynamic_calib_engine and dynamic_infer_engine: - # TODO(aaroey): test this case, the conversion from static calibration - # engine to dynamic inference engine should be a noop. - continue if use_optimizer: # TODO(aaroey): if use_optimizer is True we need to get the inference # graphdef using custom python wrapper class, which is not currently # supported yet. continue - if not dynamic_calib_engine: + if not dynamic_engine: # TODO(aaroey): construction of static calibration engine is not # supported yet. continue - if dynamic_calib_engine and not dynamic_infer_engine: - # TODO(aaroey): construction of static inference engine using dynamic - # calibration engine is not supported yet. - continue - else: # In non int8 mode. - if dynamic_calib_engine: - # dynamic_calib_engine doesn't affect non-int8 modes, so just let - # related tests run once on dynamic_calib_engine=False. - continue conversion = "OptimizerConversion" if use_optimizer else "ToolConversion" - infer_engine_type = ("DynamicInferEngine" - if dynamic_infer_engine else "StaticInferEngine") - calib_engine_type = "" - if precision_mode == "INT8": - calib_engine_type = ("DynamicCalibEngine" - if dynamic_calib_engine else "StaticCalibEngine") - test_name = "%s_%s_%s%s" % (conversion, precision_mode, infer_engine_type, - ("_" + calib_engine_type) - if len(calib_engine_type) else "") - setattr( - test_class, "testTfTRT_" + test_name, - _GetTest(use_optimizer, precision_mode, dynamic_infer_engine, - dynamic_calib_engine)) + engine_type = ("DynamicEngine" if dynamic_engine else "StaticEngine") + test_name = "%s_%s_%s" % (conversion, precision_mode, engine_type) + run_params = RunParams( + use_optimizer=use_optimizer, + precision_mode=precision_mode, + dynamic_engine=dynamic_engine, + test_name=test_name) + setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params)) if trt_convert.is_tensorrt_enabled(): diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py index b9e977cf67b4e94282c10313477276b04ea828aa..500057a36d60efa3b7f96f22e27973444ecc277c 100644 --- a/tensorflow/contrib/tensorrt/test/unary_test.py +++ b/tensorflow/contrib/tensorrt/test/unary_test.py @@ -100,7 +100,10 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name, input2_name], input_dims=[input_dims, input2_dims], - num_expected_engines=5, + expected_engines=[ + "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3", + "my_trt_op_4" + ], expected_output_dims=(12, 5, 8, 12), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/utils.cc b/tensorflow/contrib/tensorrt/test/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..276308b3a0a6ce864969afb0179c6a3f00d6b70b --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/utils.cc @@ -0,0 +1,101 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/contrib/tensorrt/test/utils.h" + +#include +#include + +#include "re2/re2.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace tensorrt { +namespace test { + +// TODO(aaroey): make this class thread-safe. +class TestValueManager { + public: + static TestValueManager* singleton() { + static TestValueManager* manager = new TestValueManager(); + return manager; + } + + void Enable() { + VLOG(1) << "Enabling test value"; + enabled_ = true; + } + + void Add(const string& label, const string& value) { + if (TF_PREDICT_FALSE(enabled_)) { + QCHECK_NE("", value); + VLOG(1) << "Adding test value: " << label << " -> " << value; + values_.insert({label, value}); + } + } + + string Get(const string& label) { + if (TF_PREDICT_FALSE(enabled_)) { + VLOG(1) << "Getting test value by " << label; + auto itr = values_.find(label); + if (itr == values_.end()) return ""; + return itr->second; + } + return ""; + } + + void Clear(const string& pattern) { + if (TF_PREDICT_FALSE(enabled_)) { + VLOG(1) << "Clearing test values"; + if (pattern.empty()) { + values_.clear(); + return; + } + std::vector keys_to_clear; + for (const auto& kv : values_) { + if (RE2::FullMatch(kv.first, pattern)) { + keys_to_clear.push_back(kv.first); + } + } + for (const string& key : keys_to_clear) { + values_.erase(key); + } + } + } + + private: + TestValueManager() : enabled_(false) {} + + bool enabled_; + std::unordered_map values_; +}; + +void EnableTestValue() { TestValueManager::singleton()->Enable(); } + +void ClearTestValues(const string& pattern) { + TestValueManager::singleton()->Clear(pattern); +} + +void AddTestValue(const string& label, const string& value) { + TestValueManager::singleton()->Add(label, value); +} + +string GetTestValue(const string& label) { + return TestValueManager::singleton()->Get(label); +} + +} // namespace test +} // namespace tensorrt +} // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/test/utils.h b/tensorflow/contrib/tensorrt/test/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..4bb4120206cfaae70107e55d1818e3af2f02717a --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/utils.h @@ -0,0 +1,44 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_ +#define TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_ + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace tensorrt { +namespace test { + +// Helper methods to inject values used by testing tools. +void EnableTestValue(); +void ClearTestValues(const string& pattern); +void AddTestValue(const string& label, const string& value); +string GetTestValue(const string& label); + +#define TRT_RETURN_IF_TEST_VALUE(label, value_to_return) \ + do { \ + if (::tensorflow::tensorrt::test::GetTestValue(label) == \ + value_to_return) { \ + return errors::Internal("Injected manually"); \ + } \ + } while (0) + +} // namespace test +} // namespace tensorrt +} // namespace tensorflow + +#endif // TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_ diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py index 2b134c3bce2b36e4530f8f8e58cce8d07c9bb13b..ab4d224db4d88c91c9b06d278b404879d989a834 100644 --- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py +++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py @@ -72,7 +72,7 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=1, + expected_engines=["my_trt_op_0"], expected_output_dims=(5, 6, 2, 2), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py index bec2f23eff3b1799d70519462f42c326d17924c1..56bdf848eadbdde3d5896e415ecd9754ed387eeb 100644 --- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py +++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py @@ -63,7 +63,7 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase): gdef=g.as_graph_def(), input_names=[input_name], input_dims=[input_dims], - num_expected_engines=1, + expected_engines=["my_trt_op_0"], expected_output_dims=(5, 2, 2, 6), allclose_atol=1.e-03, allclose_rtol=1.e-03) diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/contrib/tensorrt/trt_conversion.i index 422740fdf6ec381dc6f6c01e736ce8b3398586ce..6ea15fb8eff13663625420288a37ba002d57fa47 100644 --- a/tensorflow/contrib/tensorrt/trt_conversion.i +++ b/tensorflow/contrib/tensorrt/trt_conversion.i @@ -101,82 +101,22 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong); #include "tensorflow/core/util/stat_summarizer.h" #include "tensorflow/contrib/tensorrt/convert/convert_graph.h" #include "tensorflow/contrib/tensorrt/convert/utils.h" +#include "tensorflow/contrib/tensorrt/test/utils.h" %} %ignoreall %unignore tensorflow; -%unignore trt_convert; %unignore calib_convert; %unignore get_linked_tensorrt_version; %unignore get_loaded_tensorrt_version; %unignore is_tensorrt_enabled; +%unignore enable_test_value; +%unignore clear_test_values; +%unignore add_test_value; +%unignore get_test_value; %{ -std::pair trt_convert( - string graph_def_string, // The serialized GraphDef string. - std::vector output_names, - size_t max_batch_size, - size_t max_workspace_size_bytes, - int precision_mode, - int minimum_segment_size, - bool is_dyn_op, - int max_cached_engines, - std::vector cached_engine_batches - // Unfortunately we can't use TF_Status here since it - // is in c/c_api and brings in a lot of other libraries - // which in turn declare ops. These ops are included - // statically in our library and cause an abort when - // module is loaded due to double registration - // until Tensorflow properly exposes these headers - // we have to work around this by returning a string - // and converting it to exception on python side. - //,TF_Status* out_status) { -) { -#if GOOGLE_CUDA && GOOGLE_TENSORRT - string out_status; - - tensorflow::GraphDef graph_def; - if (!graph_def.ParseFromString(graph_def_string)) { - out_status = "InvalidArgument;Couldn't interpret input as a GraphDef"; - return std::pair{out_status, ""}; - } - - if (precision_mode < 0 || precision_mode > 2) { - out_status = "InvalidArgument;Invalid precision_mode"; - return std::pair{out_status, ""}; - } - if (!output_names.size()) { - out_status = "InvalidArgument;Size of the output_names vector is 0"; - return std::pair{out_status, ""}; - } - tensorflow::GraphDef out_graph; - tensorflow::Status conversion_status = - tensorflow::tensorrt::convert::ConvertGraphDefToTensorRT( - graph_def, output_names, max_batch_size, max_workspace_size_bytes, - &out_graph, precision_mode, minimum_segment_size, - is_dyn_op, max_cached_engines, cached_engine_batches); - if (!conversion_status.ok()) { - auto retCode = (int)conversion_status.code(); - char buff[2000]; - snprintf(buff, 2000, "%d;%s", retCode, - conversion_status.error_message().c_str()); - out_status = buff; - return std::pair{out_status, ""}; - } - string result; - if (!out_graph.SerializeToString(&result)) { - out_status = "InvalidArgument;Couldn't serialize output as a GraphDef"; - return std::pair{out_status, ""}; - } - out_status = "OK;All good!"; - return std::pair{out_status, result}; -#else - // Returns FAILED_PRECONDITION. - return std::pair{"9;TensorRT is not enabled!", ""}; -#endif // GOOGLE_CUDA && GOOGLE_TENSORRT -} - std::pair calib_convert( string graph_def_string, bool is_dyn_op // unfortunately we can't use TF_Status here since it @@ -251,20 +191,44 @@ bool is_tensorrt_enabled() { return tensorflow::tensorrt::IsGoogleTensorRTEnabled(); } -%} +void enable_test_value() { + tensorflow::tensorrt::test::EnableTestValue(); +} + +#if PY_MAJOR_VERSION < 3 +#define TRT_PY_TO_CPP_STRING PyString_AsString +#define TRT_CPP_TO_PY_STRING PyString_FromString +#else +#define TRT_PY_TO_CPP_STRING PyUnicode_AsUTF8 +#define TRT_CPP_TO_PY_STRING PyUnicode_FromString +#endif + +void clear_test_values(PyObject* pattern) { + tensorflow::tensorrt::test::ClearTestValues( + string(TRT_PY_TO_CPP_STRING(pattern))); +} + +void add_test_value(PyObject* label, PyObject* value) { + tensorflow::tensorrt::test::AddTestValue( + string(TRT_PY_TO_CPP_STRING(label)), string(TRT_PY_TO_CPP_STRING(value))); +} -std::pair calib_convert(string graph_def_string, bool is_dyn_op); +PyObject* get_test_value(PyObject* label) { + string value = tensorflow::tensorrt::test::GetTestValue( + string(TRT_PY_TO_CPP_STRING(label))); + return TRT_CPP_TO_PY_STRING(value.c_str()); +} -std::pair trt_convert(string graph_def_string, - std::vector output_names, - size_t max_batch_size, - size_t max_workspace_size_bytes, - int precision_mode, int minimum_segment_size, - bool is_dyn_op, - int max_cached_engines, - std::vector cached_engine_batches); +%} + +std::pair calib_convert( + string graph_def_string, bool is_dyn_op); version_struct get_linked_tensorrt_version(); version_struct get_loaded_tensorrt_version(); bool is_tensorrt_enabled(); +void enable_test_value(); +void clear_test_values(PyObject* pattern); +void add_test_value(PyObject* label, PyObject* value); +PyObject* get_test_value(PyObject* label); %unignoreall diff --git a/tensorflow/contrib/timeseries/__init__.py b/tensorflow/contrib/timeseries/__init__.py index 11db56b1b7a48b401efeece91283eb7084747c14..654a4db098757a969c2d298f7ed490083e63b9da 100644 --- a/tensorflow/contrib/timeseries/__init__.py +++ b/tensorflow/contrib/timeseries/__init__.py @@ -27,6 +27,9 @@ @@TrainEvalFeatures @@FilteringResults + +@@TimeSeriesRegressor +@@OneShotPredictionHead """ from __future__ import absolute_import diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD index 7020989d6895fd6322db45cda6f7dd99d417d937..c230919168b937b26c68e141e15f0762ad70f3e6 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD @@ -94,7 +94,6 @@ py_library( "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/estimator:export", "//tensorflow/python/feature_column", ], ) @@ -149,9 +148,6 @@ py_library( "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/estimator:export", - "//tensorflow/python/estimator:head", - "//tensorflow/python/estimator:metric_keys", ], ) @@ -161,6 +157,7 @@ py_test( srcs = [ "head_test.py", ], + shard_count = 4, srcs_version = "PY2AND3", tags = ["no_pip_gpu"], # b/63391119 deps = [ diff --git a/tensorflow/contrib/timeseries/python/timeseries/__init__.py b/tensorflow/contrib/timeseries/python/timeseries/__init__.py index c683dad71de8f8502f08a4e823faa79d60d5604d..8462138339cda8557d9c9ee6e79d4c7a67ad1aa7 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/__init__.py +++ b/tensorflow/contrib/timeseries/python/timeseries/__init__.py @@ -24,5 +24,6 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils from tensorflow.contrib.timeseries.python.timeseries.ar_model import * from tensorflow.contrib.timeseries.python.timeseries.estimators import * from tensorflow.contrib.timeseries.python.timeseries.feature_keys import * +from tensorflow.contrib.timeseries.python.timeseries.head import * from tensorflow.contrib.timeseries.python.timeseries.input_pipeline import * # pylint: enable=wildcard-import diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py index 63f5d3568bc208e1ce0ae69abb3a93132163c860..5eb4deefb9494566bc31b2b8a72aab4f04f2980e 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py @@ -195,7 +195,7 @@ class ARModelTest(test.TestCase): self.train_helper(input_window_size=10, loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, train_steps=300, - max_loss=1.5, + max_loss=2.5, anomaly_distribution=None) def test_autoregression_normal_multiple_periods(self): diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py index 769183f40ad269954dac70db393207c266052144..0ddc4b4144da25206735b0480aa0886374ed43a8 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py @@ -37,6 +37,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.training import training as train from tensorflow.python.util import nest @@ -79,12 +80,137 @@ class TimeSeriesRegressor(estimator_lib.Estimator): model_dir=model_dir, config=config) - # TODO(allenl): A parsing input receiver function, which takes a serialized - # tf.Example containing all features (times, values, any exogenous features) - # and serialized model state (possibly also as a tf.Example). - def build_raw_serving_input_receiver_fn(self, - default_batch_size=None, - default_series_length=None): + def _model_start_state_placeholders( + self, batch_size_tensor, static_batch_size=None): + """Creates placeholders with zeroed start state for the current model.""" + gathered_state = {} + # Models may not know the shape of their state without creating some + # variables/ops. Avoid polluting the default graph by making a new one. We + # use only static metadata from the returned Tensors. + with ops.Graph().as_default(): + self._model.initialize_graph() + # Evaluate the initial state as same-dtype "zero" values. These zero + # constants aren't used, but are necessary for feeding to + # placeholder_with_default for the "cold start" case where state is not + # fed to the model. + def _zeros_like_constant(tensor): + return tensor_util.constant_value(array_ops.zeros_like(tensor)) + start_state = nest.map_structure( + _zeros_like_constant, self._model.get_start_state()) + for prefixed_state_name, state in ts_head_lib.state_to_dictionary( + start_state).items(): + state_shape_with_batch = tensor_shape.TensorShape( + (static_batch_size,)).concatenate(state.shape) + default_state_broadcast = array_ops.tile( + state[None, ...], + multiples=array_ops.concat( + [batch_size_tensor[None], + array_ops.ones(len(state.shape), dtype=dtypes.int32)], + axis=0)) + gathered_state[prefixed_state_name] = array_ops.placeholder_with_default( + input=default_state_broadcast, + name=prefixed_state_name, + shape=state_shape_with_batch) + return gathered_state + + def build_one_shot_parsing_serving_input_receiver_fn( + self, filtering_length, prediction_length, default_batch_size=None, + values_input_dtype=None, truncate_values=False): + """Build an input_receiver_fn for export_savedmodel accepting tf.Examples. + + Only compatible with `OneShotPredictionHead` (see `head`). + + Args: + filtering_length: The number of time steps used as input to the model, for + which values are provided. If more than `filtering_length` values are + provided (via `truncate_values`), only the first `filtering_length` + values are used. + prediction_length: The number of time steps requested as predictions from + the model. Times and all exogenous features must be provided for these + steps. + default_batch_size: If specified, must be a scalar integer. Sets the batch + size in the static shape information of all feature Tensors, which means + only this batch size will be accepted by the exported model. If None + (default), static shape information for batch sizes is omitted. + values_input_dtype: An optional dtype specification for values in the + tf.Example protos (either float32 or int64, since these are the numeric + types supported by tf.Example). After parsing, values are cast to the + model's dtype (float32 or float64). + truncate_values: If True, expects `filtering_length + prediction_length` + values to be provided, but only uses the first `filtering_length`. If + False (default), exactly `filtering_length` values must be provided. + + Returns: + An input_receiver_fn which may be passed to the Estimator's + export_savedmodel. + + Expects features contained in a vector of serialized tf.Examples with + shape [batch size] (dtype `tf.string`), each tf.Example containing + features with the following shapes: + times: [filtering_length + prediction_length] integer + values: [filtering_length, num features] floating point. If + `truncate_values` is True, expects `filtering_length + + prediction_length` values but only uses the first `filtering_length`. + all exogenous features: [filtering_length + prediction_length, ...] + (various dtypes) + """ + if values_input_dtype is None: + values_input_dtype = dtypes.float32 + if truncate_values: + values_proto_length = filtering_length + prediction_length + else: + values_proto_length = filtering_length + + def _serving_input_receiver_fn(): + """A receiver function to be passed to export_savedmodel.""" + times_column = feature_column.numeric_column( + key=feature_keys.TrainEvalFeatures.TIMES, dtype=dtypes.int64) + values_column = feature_column.numeric_column( + key=feature_keys.TrainEvalFeatures.VALUES, dtype=values_input_dtype, + shape=(self._model.num_features,)) + parsed_features_no_sequence = ( + feature_column.make_parse_example_spec( + list(self._model.exogenous_feature_columns) + + [times_column, values_column])) + parsed_features = {} + for key, feature_spec in parsed_features_no_sequence.items(): + if isinstance(feature_spec, parsing_ops.FixedLenFeature): + if key == feature_keys.TrainEvalFeatures.VALUES: + parsed_features[key] = feature_spec._replace( + shape=((values_proto_length,) + + feature_spec.shape)) + else: + parsed_features[key] = feature_spec._replace( + shape=((filtering_length + prediction_length,) + + feature_spec.shape)) + elif feature_spec.dtype == dtypes.string: + parsed_features[key] = parsing_ops.FixedLenFeature( + shape=(filtering_length + prediction_length,), + dtype=dtypes.string) + else: # VarLenFeature + raise ValueError("VarLenFeatures not supported, got %s for key %s" + % (feature_spec, key)) + tfexamples = array_ops.placeholder( + shape=[default_batch_size], dtype=dtypes.string, name="input") + features = parsing_ops.parse_example( + serialized=tfexamples, + features=parsed_features) + features[feature_keys.TrainEvalFeatures.TIMES] = array_ops.squeeze( + features[feature_keys.TrainEvalFeatures.TIMES], axis=-1) + features[feature_keys.TrainEvalFeatures.VALUES] = math_ops.cast( + features[feature_keys.TrainEvalFeatures.VALUES], + dtype=self._model.dtype)[:, :filtering_length] + features.update( + self._model_start_state_placeholders( + batch_size_tensor=array_ops.shape( + features[feature_keys.TrainEvalFeatures.TIMES])[0], + static_batch_size=default_batch_size)) + return export_lib.ServingInputReceiver( + features, {"examples": tfexamples}) + return _serving_input_receiver_fn + + def build_raw_serving_input_receiver_fn( + self, default_batch_size=None, default_series_length=None): """Build an input_receiver_fn for export_savedmodel which accepts arrays. Automatically creates placeholders for exogenous `FeatureColumn`s passed to @@ -149,34 +275,10 @@ class TimeSeriesRegressor(estimator_lib.Estimator): + batch_only_feature_shape[1:]) placeholders[feature_key] = array_ops.placeholder( dtype=value_dtype, name=feature_key, shape=feature_shape) - # Models may not know the shape of their state without creating some - # variables/ops. Avoid polluting the default graph by making a new one. We - # use only static metadata from the returned Tensors. - with ops.Graph().as_default(): - self._model.initialize_graph() - # Evaluate the initial state as same-dtype "zero" values. These zero - # constants aren't used, but are necessary for feeding to - # placeholder_with_default for the "cold start" case where state is not - # fed to the model. - def _zeros_like_constant(tensor): - return tensor_util.constant_value(array_ops.zeros_like(tensor)) - start_state = nest.map_structure( - _zeros_like_constant, self._model.get_start_state()) batch_size_tensor = array_ops.shape(time_placeholder)[0] - for prefixed_state_name, state in ts_head_lib.state_to_dictionary( - start_state).items(): - state_shape_with_batch = tensor_shape.TensorShape( - (default_batch_size,)).concatenate(state.shape) - default_state_broadcast = array_ops.tile( - state[None, ...], - multiples=array_ops.concat( - [batch_size_tensor[None], - array_ops.ones(len(state.shape), dtype=dtypes.int32)], - axis=0)) - placeholders[prefixed_state_name] = array_ops.placeholder_with_default( - input=default_state_broadcast, - name=prefixed_state_name, - shape=state_shape_with_batch) + placeholders.update( + self._model_start_state_placeholders( + batch_size_tensor, static_batch_size=default_batch_size)) return export_lib.ServingInputReceiver(placeholders, placeholders) return _serving_input_receiver_fn diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py index 8686a803e5bb023bbddb7df3203080fee0e13fea..32194e400e6ada594ef2a067bf612826a6e4acd3 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/head.py +++ b/tensorflow/contrib/timeseries/python/timeseries/head.py @@ -26,6 +26,7 @@ from tensorflow.python.estimator.canned import metric_keys from tensorflow.python.estimator.export import export_lib from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops @@ -180,7 +181,7 @@ class TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acce return math_ops.cast(value, self.model.dtype) if name == feature_keys.PredictionFeatures.STATE_TUPLE: return value # Correct dtypes are model-dependent - return ops.convert_to_tensor(value) + return sparse_tensor.convert_to_tensor_or_sparse_tensor(value) def _gather_state(self, features): """Returns `features` with state packed, indicates if packing was done.""" @@ -202,6 +203,29 @@ class TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acce flat_sequence=[tensor for _, _, tensor in numbered_state]) return features, True + def _check_predict_features(self, features): + """Raises errors if features are not suitable for prediction.""" + if feature_keys.PredictionFeatures.TIMES not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.PredictionFeatures.TIMES)) + if feature_keys.PredictionFeatures.STATE_TUPLE not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.PredictionFeatures.STATE_TUPLE)) + times_feature = features[feature_keys.PredictionFeatures.TIMES] + if not times_feature.get_shape().is_compatible_with([None, None]): + raise ValueError( + ("Expected shape (batch dimension, window size) for feature '{}' " + "(got shape {})").format(feature_keys.PredictionFeatures.TIMES, + times_feature.get_shape())) + _check_feature_shapes_compatible_with( + features=features, + compatible_with_name=feature_keys.PredictionFeatures.TIMES, + compatible_with_value=times_feature, + ignore=set([ + # Model-dependent shapes + feature_keys.PredictionFeatures.STATE_TUPLE + ])) + def create_estimator_spec(self, features, mode, labels=None): """Performs basic error checking and returns an EstimatorSpec.""" with ops.name_scope(self._name, "head"): @@ -230,7 +254,7 @@ class TimeSeriesRegressionHead(head_lib._Head): # pylint:disable=protected-acce mode == estimator_lib.ModeKeys.EVAL): _check_train_eval_features(features, self.model) elif mode == estimator_lib.ModeKeys.PREDICT: - _check_predict_features(features) + self._check_predict_features(features) else: raise ValueError("Unknown mode '{}' passed to model_fn.".format(mode)) @@ -267,6 +291,44 @@ class OneShotPredictionHead(TimeSeriesRegressionHead): each time predictions are requested when using this head. """ + def _check_predict_features(self, features): + """Raises errors if features are not suitable for one-shot prediction.""" + if feature_keys.PredictionFeatures.TIMES not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.PredictionFeatures.TIMES)) + if feature_keys.TrainEvalFeatures.VALUES not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.TrainEvalFeatures.VALUES)) + if feature_keys.PredictionFeatures.STATE_TUPLE not in features: + raise ValueError("Expected a '{}' feature for prediction.".format( + feature_keys.PredictionFeatures.STATE_TUPLE)) + times_feature = features[feature_keys.PredictionFeatures.TIMES] + if not times_feature.get_shape().is_compatible_with([None, None]): + raise ValueError( + ("Expected shape (batch dimension, window size) for feature '{}' " + "(got shape {})").format(feature_keys.PredictionFeatures.TIMES, + times_feature.get_shape())) + _check_feature_shapes_compatible_with( + features=features, + compatible_with_name=feature_keys.PredictionFeatures.TIMES, + compatible_with_value=times_feature, + ignore=set([ + # Model-dependent shapes + feature_keys.PredictionFeatures.STATE_TUPLE, + # One shot prediction head relies on values being shorter than + # times. Even though we're predicting eventually, we need values for + # the filtering phase. + feature_keys.TrainEvalFeatures.VALUES, + ])) + + def _evaluate_ops(self, features): + """Add ops for evaluation (aka filtering) to the graph.""" + spec = super(OneShotPredictionHead, self)._evaluate_ops(features) + # No state is fed to OneShotPredictionHead, so we don't return it; it being + # a tuple can cause issues for downstream infrastructure. + del spec.eval_metric_ops[feature_keys.State.STATE_TUPLE] + return spec + def _serving_ops(self, features): """Add ops for serving to the graph.""" with variable_scope.variable_scope("model", use_resource=True): @@ -333,29 +395,6 @@ def _check_feature_shapes_compatible_with(features, times_shape=compatible_with_value.get_shape())) -def _check_predict_features(features): - """Raises errors if features are not suitable for prediction.""" - if feature_keys.PredictionFeatures.TIMES not in features: - raise ValueError("Expected a '{}' feature for prediction.".format( - feature_keys.PredictionFeatures.TIMES)) - if feature_keys.PredictionFeatures.STATE_TUPLE not in features: - raise ValueError("Expected a '{}' feature for prediction.".format( - feature_keys.PredictionFeatures.STATE_TUPLE)) - times_feature = features[feature_keys.PredictionFeatures.TIMES] - if not times_feature.get_shape().is_compatible_with([None, None]): - raise ValueError( - ("Expected shape (batch dimension, window size) for feature '{}' " - "(got shape {})").format(feature_keys.PredictionFeatures.TIMES, - times_feature.get_shape())) - _check_feature_shapes_compatible_with( - features=features, - compatible_with_name=feature_keys.PredictionFeatures.TIMES, - compatible_with_value=times_feature, - ignore=set([ - feature_keys.PredictionFeatures.STATE_TUPLE # Model-dependent shapes - ])) - - def _check_train_eval_features(features, model): """Raise errors if features are not suitable for training/evaluation.""" if feature_keys.TrainEvalFeatures.TIMES not in features: diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py index 78c2cec21cf4b6ccf6c314e54de41f3e95466adf..bda3b53aca0d0156e542e2bedcadf5caa6b3d2cf 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools import os from absl.testing import parameterized @@ -26,12 +27,14 @@ import six from tensorflow.contrib.estimator.python.estimator import extenders from tensorflow.contrib.timeseries.examples import lstm as lstm_example +from tensorflow.contrib.timeseries.python.timeseries import ar_model from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators from tensorflow.contrib.timeseries.python.timeseries import feature_keys from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib from tensorflow.contrib.timeseries.python.timeseries import input_pipeline from tensorflow.contrib.timeseries.python.timeseries import model from tensorflow.contrib.timeseries.python.timeseries import state_management +from tensorflow.core.example import example_pb2 from tensorflow.python.client import session as session_lib from tensorflow.python.estimator import estimator_lib @@ -343,15 +346,33 @@ def _structural_ensemble_regressor( model_dir=model_dir) +def _ar_lstm_regressor( + model_dir, head_type, exogenous_feature_columns): + return ts_estimators.TimeSeriesRegressor( + model=ar_model.ARModel( + periodicities=10, input_window_size=10, output_window_size=6, + num_features=5, + exogenous_feature_columns=exogenous_feature_columns, + prediction_model_factory=functools.partial( + ar_model.LSTMPredictionModel, + num_units=10)), + head_type=head_type, + model_dir=model_dir) + + class OneShotTests(parameterized.TestCase): @parameterized.named_parameters( + {"testcase_name": "ar_lstm_regressor", + "estimator_factory": _ar_lstm_regressor}, {"testcase_name": "custom_time_series_regressor", "estimator_factory": _custom_time_series_regressor}, {"testcase_name": "structural_ensemble_regressor", "estimator_factory": _structural_ensemble_regressor}) def test_one_shot_prediction_head_export(self, estimator_factory): - model_dir = os.path.join(test.get_temp_dir(), str(ops.uid())) + def _new_temp_dir(): + return os.path.join(test.get_temp_dir(), str(ops.uid())) + model_dir = _new_temp_dir() categorical_column = feature_column.categorical_column_with_hash_bucket( key="categorical_exogenous_feature", hash_bucket_size=16) exogenous_feature_columns = [ @@ -376,8 +397,10 @@ class OneShotTests(parameterized.TestCase): input_pipeline.NumpyReader(train_features), shuffle_seed=2, num_threads=1, batch_size=16, window_size=16) estimator.train(input_fn=train_input_fn, steps=5) + result = estimator.evaluate(input_fn=train_input_fn, steps=1) + self.assertNotIn(feature_keys.State.STATE_TUPLE, result) input_receiver_fn = estimator.build_raw_serving_input_receiver_fn() - export_location = estimator.export_savedmodel(test.get_temp_dir(), + export_location = estimator.export_savedmodel(_new_temp_dir(), input_receiver_fn) graph = ops.Graph() with graph.as_default(): @@ -412,6 +435,41 @@ class OneShotTests(parameterized.TestCase): in predict_signature.outputs.items()} output = session.run(fetches, feed_dict=feeds) self.assertEqual((2, 15, 5), output["mean"].shape) + # Build a parsing input function, then make a tf.Example for it to parse. + export_location = estimator.export_savedmodel( + _new_temp_dir(), + estimator.build_one_shot_parsing_serving_input_receiver_fn( + filtering_length=20, prediction_length=15)) + graph = ops.Graph() + with graph.as_default(): + with session_lib.Session() as session: + example = example_pb2.Example() + times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES] + values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES] + times.int64_list.value.extend(range(35)) + for i in range(20): + values.float_list.value.extend( + [float(i) * 2. + feature_number + for feature_number in range(5)]) + real_feature = example.features.feature["2d_exogenous_feature"] + categortical_feature = example.features.feature[ + "categorical_exogenous_feature"] + for i in range(35): + real_feature.float_list.value.extend([1, 1]) + categortical_feature.bytes_list.value.append(b"strkey") + # Serialize the tf.Example for feeding to the Session + examples = [example.SerializeToString()] * 2 + signatures = loader.load( + session, [tag_constants.SERVING], export_location) + predict_signature = signatures.signature_def[ + feature_keys.SavedModelLabels.PREDICT] + ((_, input_value),) = predict_signature.inputs.items() + feeds = {graph.as_graph_element(input_value.name): examples} + fetches = {output_key: graph.as_graph_element(output_value.name) + for output_key, output_value + in predict_signature.outputs.items()} + output = session.run(fetches, feed_dict=feeds) + self.assertEqual((2, 15, 5), output["mean"].shape) if __name__ == "__main__": diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 5a7825f29a29585af87c113b2475fb9a1d795d75..2abf402e6cf566ee09a73b3d654f7ee2aa7b0436 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -41,13 +41,13 @@ py_library( "python/tpu/tpu_config.py", "python/tpu/tpu_context.py", "python/tpu/tpu_estimator.py", - "python/tpu/tpu_system_metadata.py", "python/tpu/util.py", ], srcs_version = "PY2AND3", deps = [ ":tpu_lib", - ":tpu_py", + "//tensorflow/compiler/xla/experimental/xla_sharding", + "//tensorflow/compiler/xla/python_api:xla_shape", "//tensorflow/contrib/training:training_py", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", @@ -62,10 +62,7 @@ py_library( "//tensorflow/python:training", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", - "//tensorflow/python/estimator", - "//tensorflow/python/estimator:model_fn", - "//tensorflow/python/estimator:run_config", - "//tensorflow/python/estimator:util", + "//tensorflow/python/estimator:estimator_py", "@six_archive//:six", ], ) @@ -134,7 +131,7 @@ py_library( tf_custom_op_py_library( name = "tpu_py", - srcs = glob(["python/ops/*.py"]) + ["__init__.py"], + srcs = glob(["python/ops/*.py"]), dso = [":python/ops/_tpu_ops.so"], kernels = [ ":all_ops", @@ -153,9 +150,13 @@ tf_custom_op_py_library( py_library( name = "tpu", - srcs = ["python/tpu/__init__.py"], + srcs = [ + "__init__.py", + "python/tpu/__init__.py", + ], srcs_version = "PY2AND3", deps = [ + ":keras_support", # split out to avoid cycle with tpu_strategy ":tpu_estimator", ":tpu_lib", ], @@ -170,19 +171,13 @@ py_library( visibility = [ "//cloud/vmm/testing/tests/tpu:__subpackages__", "//learning/brain:__subpackages__", - # TODO(b/111651964): Clean special visibility for keras_support. - # - # Note: If you are an end user, please do not add your project to this - # visibility. This feature is experimental, and will be made public - # when ready. - "//third_party/cloud_tpu/models/keras:__subpackages__", "//tensorflow:__subpackages__", + "//third_party/cloud_tpu/models/keras:__subpackages__", ], deps = [ ":tpu_lib", - ":tpu_py", "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py", - "//tensorflow/contrib/distribute/python:tpu_strategy", + "//tensorflow/contrib/distribute", "//tensorflow/contrib/framework:framework_py", "//tensorflow/contrib/tpu/proto:compilation_result_proto_py", "//tensorflow/core:protos_all_py", @@ -197,7 +192,7 @@ py_library( "//tensorflow/python:tensor_spec", "//tensorflow/python:variable_scope", "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/keras:backend", "//tensorflow/python/keras:engine", "//tensorflow/python/keras:layers", @@ -218,6 +213,7 @@ py_library( "python/tpu/tpu_function.py", "python/tpu/tpu_optimizer.py", "python/tpu/tpu_sharding.py", + "python/tpu/tpu_system_metadata.py", "python/tpu/training_loop.py", ], srcs_version = "PY2AND3", diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py index d5484e9032fb874e9f608ec398be4cd03b2aaf32..537d94b7979af3e4bd3fb7392c8dcc5a210e98af 100644 --- a/tensorflow/contrib/tpu/__init__.py +++ b/tensorflow/contrib/tpu/__init__.py @@ -18,6 +18,10 @@ @@cross_replica_sum @@infeed_dequeue @@infeed_dequeue_tuple +@@infeed_enqueue +@@infeed_enqueue_tuple +@@outfeed_dequeue +@@outfeed_dequeue_tuple @@outfeed_enqueue @@outfeed_enqueue_tuple @@ -47,6 +51,9 @@ @@InputPipelineConfig @@TPUConfig @@bfloat16_scope + +@@TPUDistributionStrategy +@@keras_to_tpu_model """ from __future__ import absolute_import @@ -58,11 +65,13 @@ from tensorflow.contrib.tpu.python import profiler from tensorflow.contrib.tpu.python.ops.tpu_ops import * from tensorflow.contrib.tpu.python.tpu.bfloat16 import * from tensorflow.contrib.tpu.python.tpu.device_assignment import * +from tensorflow.contrib.tpu.python.tpu.keras_support import tpu_model as keras_to_tpu_model +from tensorflow.contrib.tpu.python.tpu.keras_support import TPUDistributionStrategy from tensorflow.contrib.tpu.python.tpu.topology import * from tensorflow.contrib.tpu.python.tpu.tpu import * from tensorflow.contrib.tpu.python.tpu.tpu_config import * from tensorflow.contrib.tpu.python.tpu.tpu_estimator import * -from tensorflow.contrib.tpu.python.tpu.tpu_feed import * +from tensorflow.contrib.tpu.python.tpu.tpu_feed import InfeedQueue from tensorflow.contrib.tpu.python.tpu.tpu_optimizer import * from tensorflow.contrib.tpu.python.tpu.training_loop import * # pylint: enable=wildcard-import,unused-import diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index f80f5652af79d410946971573ae160fdd0b85f6d..8e6e9aa0cded630f39bfd699def37e06a8b920e8 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -84,8 +84,6 @@ ProfileRequest PopulateProfileRequest(int duration_ms, request.add_tools("memory_viewer"); request.add_tools("overview_page"); *request.mutable_opts() = opts; - std::cout << "Limiting the number of trace events to " << kMaxEvents - << std::endl; return request; } @@ -99,7 +97,6 @@ bool Profile(const string& service_addr, const string& logdir, int duration_ms, ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; - // TODO(ioeric): use `SetMaxReceiveMessageSize` instead once it's available. // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their // `ValidateHostPortPair` checks for empty host string case. channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, @@ -166,6 +163,85 @@ bool NewSession(const string& service_addr, return new_session_response.empty_trace(); } +// Starts tracing on a single or multiple TPU hosts and saves the result in the +// given logdir. If no trace was collected, retries tracing for +// num_tracing_attempts. +void StartTracing(const tensorflow::string& service_addr, + const tensorflow::string& logdir, + const tensorflow::string& workers_list, + bool include_dataset_ops, int duration_ms, + int num_tracing_attempts) { + // Use the current timestamp as the run name. + tensorflow::string session_id = GetCurrentTimeStampAsString(); + constexpr char kProfilePluginDirectory[] = "plugins/profile/"; + tensorflow::string repository_root = + io::JoinPath(logdir, kProfilePluginDirectory); + std::vector hostnames = + tensorflow::str_util::Split(workers_list, ","); + + bool empty_trace = false; + int remaining_attempts = num_tracing_attempts; + tensorflow::ProfileOptions opts; + opts.set_include_dataset_ops(include_dataset_ops); + while (true) { + std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. " + << "Remaining attempt(s): " << remaining_attempts-- << std::endl; + if (hostnames.empty()) { + empty_trace = tensorflow::tpu::Profile(service_addr, logdir, duration_ms, + repository_root, session_id, opts); + } else { + tensorflow::string tpu_master = service_addr; + empty_trace = + tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms, + repository_root, session_id, opts); + } + if (remaining_attempts <= 0 || !empty_trace) break; + std::cout << "No trace event is collected. Automatically retrying." + << std::endl + << std::endl; + } + + if (empty_trace) { + std::cout << "No trace event is collected after " << num_tracing_attempts + << " attempt(s). " + << "Perhaps, you want to try again (with more attempts?)." + << std::endl + << "Tip: increase number of attempts with --num_tracing_attempts." + << std::endl; + } +} + +MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) { + MonitorRequest request; + request.set_duration_ms(duration_ms); + request.set_monitoring_level(monitoring_level); + return request; +} + +// Repeatedly collects profiles and shows user-friendly metrics for +// 'num_queries' time(s). +void StartMonitoring(const tensorflow::string& service_addr, int duration_ms, + int monitoring_level, int num_queries) { + for (int query = 0; query < num_queries; ++query) { + MonitorRequest request = + PopulateMonitorRequest(duration_ms, monitoring_level); + + ::grpc::ClientContext context; + ::grpc::ChannelArguments channel_args; + channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, + std::numeric_limits::max()); + std::unique_ptr stub = + TPUProfiler::NewStub(::grpc::CreateCustomChannel( + "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(), + channel_args)); + MonitorResponse response; + TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response))); + + std::cout << "Xprof Monitoring Results (Sample " << query + 1 << "):\n\n" + << response.data() << std::flush; + } +} + } // namespace } // namespace tpu } // namespace tensorflow @@ -174,9 +250,11 @@ int main(int argc, char** argv) { tensorflow::string FLAGS_service_addr; tensorflow::string FLAGS_logdir; tensorflow::string FLAGS_workers_list; - int FLAGS_duration_ms = 2000; + int FLAGS_duration_ms = 0; int FLAGS_num_tracing_attempts = 3; bool FLAGS_include_dataset_ops = true; + int FLAGS_monitoring_level = 0; + int FLAGS_num_queries = 100; std::vector flag_list = { tensorflow::Flag("service_addr", &FLAGS_service_addr, "Address of TPU profiler service e.g. localhost:8466"), @@ -186,21 +264,38 @@ int main(int argc, char** argv) { tensorflow::Flag("logdir", &FLAGS_logdir, "Path of TensorBoard log directory e.g. /tmp/tb_log, " "gs://tb_bucket"), - tensorflow::Flag("duration_ms", &FLAGS_duration_ms, - "Duration of tracing in ms. Default is 2000ms."), + tensorflow::Flag( + "duration_ms", &FLAGS_duration_ms, + "Duration of tracing or monitoring in ms. Default is 2000ms for " + "tracing and 1000ms for monitoring."), tensorflow::Flag("num_tracing_attempts", &FLAGS_num_tracing_attempts, "Automatically retry N times when no trace event " "is collected. Default is 3."), tensorflow::Flag("include_dataset_ops", &FLAGS_include_dataset_ops, "Set to false to profile longer TPU device traces."), - }; + tensorflow::Flag("monitoring_level", &FLAGS_monitoring_level, + "Choose a monitoring level between 1 and 2 to monitor " + "your TPU job continuously. Level 2 is more verbose " + "than level 1 and shows more metrics."), + tensorflow::Flag("num_queries", &FLAGS_num_queries, + "This script will run monitoring for num_queries before " + "it stops.")}; std::cout << "Welcome to the Cloud TPU Profiler v" << TPU_PROFILER_VERSION << std::endl; tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list); bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); - if (!parse_ok || FLAGS_service_addr.empty() || FLAGS_logdir.empty()) { + if (!parse_ok || FLAGS_service_addr.empty() || + (FLAGS_logdir.empty() && FLAGS_monitoring_level == 0)) { + // Fail if flags are not parsed correctly or service_addr not provided. + // Also, fail if neither logdir is provided (required for tracing) nor + // monitoring level is provided (required for monitoring). + std::cout << usage.c_str() << std::endl; + return 2; + } + if (FLAGS_monitoring_level < 0 || FLAGS_monitoring_level > 2) { + // Invalid monitoring level. std::cout << usage.c_str() << std::endl; return 2; } @@ -213,52 +308,27 @@ int main(int argc, char** argv) { } tensorflow::port::InitMain(argv[0], &argc, &argv); - // Sets the minimum duration_ms and tracing attempts to one. - int duration_ms = std::max(FLAGS_duration_ms, 1); - int remaining_attempts = std::max(FLAGS_num_tracing_attempts, 1); - tensorflow::ProfileOptions opts; - opts.set_include_dataset_ops(FLAGS_include_dataset_ops); - tensorflow::ProfileResponse response; - - // Use the current timestamp as the run name. - tensorflow::string session_id = - tensorflow::tpu::GetCurrentTimeStampAsString(); - constexpr char kProfilePluginDirectory[] = "plugins/profile/"; - tensorflow::string repository_root = - ::tensorflow::io::JoinPath(FLAGS_logdir, kProfilePluginDirectory); - std::vector hostnames = - tensorflow::str_util::Split(FLAGS_workers_list, ","); - - bool empty_trace = false; - while (true) { - std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. " - << "Remaining attempt(s): " << remaining_attempts-- << std::endl; - if (hostnames.empty()) { - empty_trace = tensorflow::tpu::Profile(FLAGS_service_addr, FLAGS_logdir, - duration_ms, repository_root, - session_id, opts); - } else { - tensorflow::string tpu_master = FLAGS_service_addr; - empty_trace = - tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms, - repository_root, session_id, opts); - } - if (remaining_attempts <= 0 || !empty_trace) break; - std::cout << "No trace event is collected. Automatically retrying." - << std::endl - << std::endl; + // Sets the minimum duration_ms, tracing attempts and num queries. + int duration_ms = std::max(FLAGS_duration_ms, 0); + if (duration_ms == 0) { + // If profiling duration was not set by user or set to a negative value, we + // set it to default values of 2000ms for tracing and 1000ms for monitoring. + duration_ms = FLAGS_monitoring_level == 0 ? 2000 : 1000; } + int num_tracing_attempts = std::max(FLAGS_num_tracing_attempts, 1); + int num_queries = std::max(FLAGS_num_queries, 1); - if (empty_trace) { - std::cout << "No trace event is collected after " - << FLAGS_num_tracing_attempts << " attempt(s). " - << "Perhaps, you want to try again (with more attempts?)." - << std::endl - << "Tip: increase number of attempts with --num_tracing_attempts." + if (FLAGS_monitoring_level != 0) { + std::cout << "Since monitoring level is provided, profile " + << FLAGS_service_addr << " for " << duration_ms + << "ms and show metrics for " << num_queries << " time(s)." << std::endl; - // Don't dump profile data if no trace is collected. - return 0; + tensorflow::tpu::StartMonitoring(FLAGS_service_addr, duration_ms, + FLAGS_monitoring_level, num_queries); + } else { + tensorflow::tpu::StartTracing(FLAGS_service_addr, FLAGS_logdir, + FLAGS_workers_list, FLAGS_include_dataset_ops, + duration_ms, num_tracing_attempts); } - return 0; } diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py index 7a5d01cca42351f6d4d8b41d43756560ce7874d3..438f4428483a86b75ca1feb31d9c43f860fcc287 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py @@ -50,7 +50,8 @@ flags.DEFINE_string( flags.DEFINE_string( 'logdir', None, 'Path of TensorBoard log directory e.g. /tmp/tb_log, ' 'gs://tb_bucket') -flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.') +flags.DEFINE_integer('duration_ms', 0, + 'Duration of tracing or monitoring in ms.') flags.DEFINE_integer( 'num_tracing_attempts', 3, 'Automatically retry N times when no trace ' 'event is collected.') @@ -58,6 +59,14 @@ flags.DEFINE_boolean('include_dataset_ops', True, 'Set to false to profile longer TPU ' 'device traces.') +# Monitoring parameters +flags.DEFINE_integer( + 'monitoring_level', 0, 'Choose a monitoring level between ' + '1 and 2 to monitor your TPU job continuously.') +flags.DEFINE_integer( + 'num_queries', 100, + 'This script will run monitoring for num_queries before it stops.') + FLAGS = flags.FLAGS EXECUTABLE = 'data/capture_tpu_profile' JOB_NAME = 'worker' @@ -118,6 +127,8 @@ def main(unused_argv=None): cmd.append('--duration_ms=' + str(FLAGS.duration_ms)) cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts)) cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower()) + cmd.append('--monitoring_level=' + str(FLAGS.monitoring_level)) + cmd.append('--num_queries=' + str(FLAGS.num_queries)) subprocess.call(cmd) diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py index 19f088f8b862ce7b114490151f2b6a8c260b8580..d4ccb0f24679af830365037819d51529874f4fcc 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py @@ -20,7 +20,7 @@ from __future__ import print_function from setuptools import setup -_VERSION = '1.9.0' +_VERSION = '1.10.0' CONSOLE_SCRIPTS = [ 'capture_tpu_profile=cloud_tpu_profiler.main:run_main', diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h index 1bf49966d12db83f1e6904f8c00453bba278847c..aee094177bf8a36c98463055aafc777a7ed40f44 100644 --- a/tensorflow/contrib/tpu/profiler/version.h +++ b/tensorflow/contrib/tpu/profiler/version.h @@ -16,6 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ -#define TPU_PROFILER_VERSION "1.9.0" +#define TPU_PROFILER_VERSION "1.10.0" #endif // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py index 726b2d248e3086e1882004827076ed3e563d960d..471b1fa46c679dcab70e9bc12d61ada84cba79bb 100644 --- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py +++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py @@ -175,6 +175,8 @@ class DeviceAssignment(object): """Returns the physical topology coordinates of a logical core.""" if logical_core is None: logical_core = np.array([0, 0, 0], np.int32) + else: + logical_core = np.asarray(logical_core) if any(logical_core < 0) or any(logical_core >= self.computation_shape): raise ValueError("Invalid core {}; computation shape is {}".format( diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py index 81798ee42313cb9e2232a4796f56d4d16068b82f..ff893a722f4e77c743edd3b8db77aa90be1e498d 100644 --- a/tensorflow/contrib/tpu/python/tpu/keras_support.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -55,7 +55,6 @@ import time import numpy as np from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver -from tensorflow.contrib.distribute.python import tpu_strategy from tensorflow.contrib.framework.python.framework import experimental from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result from tensorflow.contrib.tpu.python.ops import tpu_ops @@ -82,7 +81,11 @@ from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging as logging -TPUDistributionStrategy = tpu_strategy.TPUStrategy # pylint: disable=invalid-name + +# Work-around dependency cycle between DistributionStrategy and TPU lib. +def TPUDistributionStrategy(*args, **kw): # pylint: disable=invalid-name + from tensorflow.contrib.distribute.python import tpu_strategy # pylint: disable=g-import-not-at-top + return tpu_strategy.TPUStrategy(*args, **kw) class TPUEmbedding(embeddings.Embedding): @@ -1130,7 +1133,7 @@ Output shape: %(output_shape)s 'layer': layer, 'input_shape': layer.input_shape, 'output_shape': layer.output_shape - }) + }) @experimental diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index 92c1eaba710d888d461dad39766bb9189ad1ab78..7fa06d6d560a4b6ffa6d9a3fd0fa208b4c60ee7f 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -970,8 +970,15 @@ def rewrite(computation, Args: computation: A Python function that builds a computation to apply to the input. If the function takes n inputs, 'inputs' should be - a list of n tensors. If the function returns m outputs, rewrite - will return a list of m tensors. + a list of n tensors. + + `computation` may return a list of operations and tensors. Tensors must + come before operations in the returned list. The return value of + `rewrite` is a list of tensors corresponding to the tensors from the + from `computation`. + + All `Operation`s returned from `computation` will be executed when + evaluating any of the returned output tensors. inputs: A list of input tensors or `None` (equivalent to an empty list). infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple of arguments as inputs to `computation`. @@ -1008,6 +1015,19 @@ _BLACKLISTED_INFERENCE_OPS = set([ ]) +def under_tpu_inference_context(): + """Check if it is currently under `tpu.rewrite_for_inference()`.""" + graph = ops.get_default_graph() + + context = graph._get_control_flow_context() # pylint: disable=protected-access + while context: + if isinstance(context, _TPUInferenceContext): + return True + context = context.outer_context + + return False + + class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext): """A `ControlFlowContext` for nodes inside a TPU inference computation. diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index 9e010922dcf565e78944bd77d49f7d3fa07f2cc4..8d05e081a7c6e0327fedae6dc2c3ba45df40d029 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -44,7 +44,6 @@ class InputPipelineConfig(object): BROADCAST = 4 -# TODO(b/72511246) Provide a simplified api to configure model parallelism. class TPUConfig( collections.namedtuple('TPUConfig', [ 'iterations_per_loop', @@ -53,6 +52,7 @@ class TPUConfig( 'per_host_input_for_training', 'tpu_job_name', 'initial_infeed_sleep_secs', + 'input_partition_dims', ])): r"""TPU related configuration required by `TPUEstimator`. @@ -90,6 +90,17 @@ class TPUConfig( initial_infeed_sleep_secs: The number of seconds the infeed thread should wait before enqueueing the first batch. This helps avoid timeouts for models that require a long compilation time. + input_partition_dims: A nested list to describe the partition dims + for all the tensors from input_fn(). The structure of + input_partition_dims must match the structure of `features` and + `labels` from input_fn(). The total number of partitions must match + `num_cores_per_replica`. For example, if input_fn() returns two tensors: + images with shape [N, H, W, C] and labels [N]. + input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4 + pieces and feed into 4 TPU cores. labels tensor are directly broadcasted + to all the TPU cores since the partition dims is `None`. + Current limitations: This feature is only supported with the PER_HOST_V2 + input mode. Raises: ValueError: If `computation_shape` or `computation_shape` are invalid. @@ -101,7 +112,8 @@ class TPUConfig( num_cores_per_replica=None, per_host_input_for_training=True, tpu_job_name=None, - initial_infeed_sleep_secs=None): + initial_infeed_sleep_secs=None, + input_partition_dims=None): # Check iterations_per_loop. util_lib.check_positive_integer(iterations_per_loop, @@ -111,6 +123,20 @@ class TPUConfig( if num_shards is not None: util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards') + if input_partition_dims is not None: + if len(input_partition_dims) != 1 and len(input_partition_dims) != 2: + raise ValueError( + 'input_partition_dims must be a list/tuple with one or two' + ' elements.') + + if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2: + raise ValueError( + 'input_partition_dims is only supported in PER_HOST_V2 mode.') + + if num_cores_per_replica is None: + raise ValueError( + 'input_partition_dims requires setting num_cores_per_replica.') + # Parse computation_shape if num_cores_per_replica is not None: if num_cores_per_replica not in [1, 2, 4, 8]: @@ -139,7 +165,8 @@ class TPUConfig( num_cores_per_replica=num_cores_per_replica, per_host_input_for_training=per_host_input_for_training, tpu_job_name=tpu_job_name, - initial_infeed_sleep_secs=initial_infeed_sleep_secs) + initial_infeed_sleep_secs=initial_infeed_sleep_secs, + input_partition_dims=input_partition_dims) class RunConfig(run_config_lib.RunConfig): diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py index a9cf54f77d8192b51af094e71707a958594874f6..806ae1c4c9918be0bf0af8579c12386c0a18aff0 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py @@ -232,11 +232,16 @@ class _InternalTPUContext(object): if tpu_system_metadata is not None: return tpu_system_metadata + cluster_def = None + if (self._config.session_config and + self._config.session_config.cluster_def.job): + cluster_def = self._config.session_config.cluster_def + # pylint: disable=protected-access tpu_system_metadata = ( tpu_system_metadata_lib._query_tpu_system_metadata( master, - run_config=self._config, + cluster_def=cluster_def, query_topology=self.model_parallelism_enabled)) self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata @@ -272,6 +277,10 @@ class _InternalTPUContext(object): def model_parallelism_enabled(self): return self._model_parallelism_enabled + @property + def input_partition_dims(self): + return self._config.tpu_config.input_partition_dims + @property def device_assignment(self): return (self._get_device_assignment() diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index ee9ad525ee34ff114808a4dc7a49702b19c78543..029492b489ea2b790660d7a02dfd189451acf26c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -224,7 +224,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote `metric_fn` runs on CPU to generate metrics and `tensors` represents the `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`. To be precise, TPU evaluation expects a slightly different signature from the - @{tf.estimator.Estimator}. While `EstimatorSpec.eval_metric_ops` expects a + `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`. The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The `tensors` usually specify the model logits, which are transferred back from @@ -247,7 +247,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote sending tensors from TPU to CPU. To reduce the overhead, try reducing the size of the tensors. The `tensors` are concatenated along their major (batch) dimension, and so must be >= rank 1. The `host_call` is useful for writing - summaries with @{tf.contrib.summary.create_file_writer}. + summaries with `tf.contrib.summary.create_file_writer`. """ def __new__(cls, @@ -781,16 +781,26 @@ def generate_per_host_v2_enqueue_ops_fn_for_host( flattened_inputs = ( inputs_structure_recorder.flatten_features_and_labels( features, labels)) - control_deps.extend(flattened_inputs) per_host_sharded_inputs.append(flattened_inputs) - infeed_queue = tpu_feed.InfeedQueue( - number_of_tuple_elements=len(per_host_sharded_inputs[0])) - captured_infeed_queue.capture(infeed_queue) + if inputs_structure_recorder.flattened_input_dims: + # pylint: disable=protected-access + infeed_queue = tpu_feed._PartitionedInfeedQueue( + number_of_tuple_elements=len(per_host_sharded_inputs[0]), + host_id=host_id, + input_partition_dims=inputs_structure_recorder.flattened_input_dims, + device_assignment=ctx.device_assignment) + per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( + per_host_sharded_inputs) + else: + infeed_queue = tpu_feed.InfeedQueue( + number_of_tuple_elements=len(per_host_sharded_inputs[0])) + per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( + per_host_sharded_inputs, + tpu_ordinal_function=tpu_ordinal_function_impl) + captured_infeed_queue.capture(infeed_queue) - per_host_enqueue_ops = infeed_queue.generate_enqueue_ops( - per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl) return per_host_enqueue_ops return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset @@ -907,21 +917,68 @@ class _InputPipeline(object): class InputsStructureRecorder(object): """The recorder to record inputs structure.""" - def __init__(self): + def __init__(self, input_partition_dims=None): # Holds the structure of inputs self._feature_names = [] self._label_names = [] self._has_labels = False self._signals_helper = None + self._flattened_input_dims = None + + if input_partition_dims: + # This should have been validated in TPUConfig. + assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.' + if len(input_partition_dims) == 2: + self._feature_dims, self._label_dims = input_partition_dims + else: + self._feature_dims = input_partition_dims[0] + self._label_dims = None + + assert self._feature_dims is not None, ('input_partition_dims[0] must ' + 'not be None') + else: + self._feature_dims = None + self._label_dims = None # Internal state. self._initialized = False + @property + def flattened_input_dims(self): + assert self._initialized, 'InputsStructureRecorder is not initialized.' + return self._flattened_input_dims + def has_labels(self): return self._has_labels + def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims, + label_dims_names, label_names, has_labels): + """Flatten input dims with the same order as flattened input tensors.""" + flattened_input_dims = [] + if feature_dims_names: + # We need a fixed ordering for matching the tensors in features. + flattened_input_dims.extend( + [feature_dims[name] for name in feature_dims_names]) + else: + flattened_input_dims.append(feature_dims) + + if label_dims_names: + # We need a fixed ordering for matching the tensors in labels. + flattened_input_dims.extend( + [label_dims[name] for name in label_dims_names]) + else: + if label_names: + num_tensors_in_label = len(label_names) + else: + num_tensors_in_label = int(has_labels) + # Setting `None` in input_partition_dims[1] will apply `None` to + # all the tensors in labels, regardless of internal structure. + flattened_input_dims.extend([label_dims] * num_tensors_in_label) + + return flattened_input_dims + def validate_and_record_structure(self, features, labels, signals=None): - """Validates and records the structure of features` and `labels`.""" + """Validates and records the structure of `features` and `labels`.""" def _extract_key_names(tensor_or_dict): if tensor_or_dict is None: @@ -949,6 +1006,24 @@ class _InputPipeline(object): self._feature_names = feature_names self._label_names = label_names self._has_labels = has_labels + if self._feature_dims is not None: + feature_dims_names = _extract_key_names(self._feature_dims) + if feature_dims_names != feature_names: + raise ValueError( + 'TPUConfig.input_partition_dims[0] mismatched feature' + ' keys. Expected {}, got {}'.format(feature_names, + feature_dims_names)) + + label_dims_names = _extract_key_names(self._label_dims) + if self._label_dims is not None and label_dims_names != label_names: + raise ValueError( + 'TPUConfig.input_partition_dims[1] mismatched label' + ' keys. Expected {}, got {}'.format(label_names, + label_dims_names)) + + self._flattened_input_dims = self._flatten_input_dims( + self._feature_dims, feature_dims_names, self._label_dims, + label_dims_names, label_names, has_labels) def flatten_features_and_labels(self, features, labels, signals=None): """Flattens the `features` and `labels` to a single tensor list.""" @@ -1043,7 +1118,8 @@ class _InputPipeline(object): Raises: ValueError: If both `sharded_features` and `num_cores` are `None`. """ - self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder() + self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder( + ctx.input_partition_dims) self._sharded_per_core = ctx.is_input_sharded_per_core() self._input_fn = input_fn @@ -2810,7 +2886,8 @@ def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): multi_tpu_predict_steps_on_single_shard, inputs=[], num_shards=num_cores, - outputs_from_all_shards=False) + outputs_from_all_shards=False, + device_assignment=ctx.device_assignment) scaffold = _get_scaffold(captured_scaffold_fn) return dummy_predict_op, host_calls, scaffold, captured_predict_hooks.get() diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py index a44b4f4622afabced9cb1b801acedb0e7b1e5d12..d9c77a3ea1bbc456f058f36d78eec1f0843ddc79 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py @@ -20,8 +20,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + +import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding +from tensorflow.compiler.xla.python_api import xla_shape from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_sharding @@ -30,6 +35,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.util import nest class InfeedQueue(object): @@ -640,3 +646,264 @@ class InfeedQueue(object): tpu_ordinal=tpu_ordinal_function(index)) for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards)) ] + + +class _PartitionedInfeedQueue(InfeedQueue): + """A helper object to build a device infeed queue with input partition. + + Args: + number_of_tuple_elements: the number of Tensors fed atomically through the + queue, must be present unless it can be inferred from other arguments. + device_assignment: A TPU `DeviceAssignment` which is used to place all the + partitions to different TPU infeed queues. + host_id: The id of the host machine. + input_partition_dims: A nested list/tuple of integers. Each inner + list/tuple describes how to partition the corresponding input tensor. + tuple_types: If not None, a list of types of the elements of the queue. + tuple_shapes: If not None, a list of shapes of the elements of the queue. + name: The name of the queue. + """ + + def __init__(self, + number_of_tuple_elements, + device_assignment, + host_id, + input_partition_dims=None, + tuple_types=None, + tuple_shapes=None, + name=None): + super(_PartitionedInfeedQueue, self).__init__( + number_of_tuple_elements=number_of_tuple_elements, + tuple_types=tuple_types, + tuple_shapes=None, + shard_dimensions=None, + name="PartitionedInfeedQueue" if name is None else name) + self._input_partition_dims = input_partition_dims + self._host_id = host_id + self._device_assignment = device_assignment + + def generate_dequeue_op(self, tpu_device=0): + """Generate TPU dequeue ops. + + Args: + tpu_device: The TPU device ordinal where the infeed instruction should be + placed. + + Returns: + A list of Outputs corresponding to a partition of infeed dequeued + into XLA, suitable for use within a replicated block. + + Raises: + ValueError: if the types or shapes of the tuple elements have not been + set; or if a dequeue op has already been generated. + """ + self.freeze() + if self._generated_dequeue_op: + raise ValueError("Can't generate two dequeue Ops from the same queue") + self._generated_dequeue_op = True + full_name = "%s/dequeue" % self._name + sharded_shapes = [ + policy.get_sharded_shape(shape) + for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies) + ] + with ops.device(tpu.core(tpu_device)): + values = tpu_ops.infeed_dequeue_tuple( + dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name) + return self._tag_sharding_attribute_for_dequeued_tensors( + values, self._input_partition_dims) + + def generate_enqueue_ops(self, per_host_sharded_inputs): + """Generates the host-side Ops to enqueue the partitioned inputs. + + per_host_sharded_inputs is a list, one for each replica, of lists of + Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed + replica i. + sharded_inputs[i][j] is partitioned by self._input_partition_dims[j]. + + For example, if sharded_inputs[i][j] is a 2-D Tensor: + [[A, B, C, D], + [E ,F, G, H]] + self._input_partition_dims[j] is [2, 4]. + + sharded_inputs[i][j] will be partitioned and flattened into: + [A, B, C, D, E, F, G, H] and fed into the logical core ids: + [0, 1, 2, 3, 4, 5, 6, 7] respectively. + + Args: + per_host_sharded_inputs: a list of lists of Tensors. The length of the + outer list determines the number of shards. Each inner list indicates + the types and shapes of the tuples in the corresponding shard. + + Returns: + A list of host-side Ops, one for each shard, that when executed together + will enqueue a full-size element of infeed. + + Raises: + ValueError: if the queue configuration has previously been frozen and the + shapes of the elements of sharded_inputs are not compatible with the + frozen configuration; or if the shapes of the elements of sharded_inputs + don't form a consistent unsharded tuple; or if the elements of a tuple + have different device constraints; or if the partition dims are invalid. + TypeError: if the queue configuration has previously been frozen and the + types of the elements of sharded_inputs are not compatible with the + frozen configuration; or if the types of the elements of sharded_inputs + don't form a consistent unsharded tuple. + """ + self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs) + number_of_replicas_per_host = len(per_host_sharded_inputs) + number_of_tuple_elements = len(per_host_sharded_inputs[0]) + + assert len(self._input_partition_dims) == number_of_tuple_elements + per_host_enqueue_ops = [] + + for replica_index in range(number_of_replicas_per_host): + flattened_inputs = per_host_sharded_inputs[replica_index] + inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs, + self._input_partition_dims) + inputs_parted_iters = [ + iter(self._partition_or_replicate_on_host(x, dims)) for x, dims in + zip(per_host_sharded_inputs[replica_index], inputs_part_dims_flat) + ] + + for core_index in xrange(self._device_assignment.num_cores_per_replica): + # Places different partitions to different logic cores. + logical_core = self._get_logical_core(core_index) + replica_id = self._device_assignment.lookup_replicas( + self._host_id, logical_core)[replica_index] + ordinal = self._device_assignment.tpu_ordinal( + replica=replica_id, logical_core=logical_core) + infeed_inputs = [] + for it in inputs_parted_iters: + input_for_device = next(it, None) + if input_for_device is not None: + infeed_inputs.append(input_for_device) + + if infeed_inputs: + per_host_enqueue_ops.append( + tpu_ops.infeed_enqueue_tuple( + inputs=infeed_inputs, + shapes=[x.shape for x in infeed_inputs], + name="enqueue/replica_{0}/input_{1}".format( + replica_index, core_index), + device_ordinal=ordinal)) + return per_host_enqueue_ops + + def _check_input_partition_dims(self, tensor, dims): + """Checks that input partition dims are valid for the `Tensor`. + + Args: + tensor: Input tensor for partitioning. + dims: A list of integer describes how to partition the input tensor. + + Raises: + ValueError: If the tensor can't be partitioned by dims or the + num_cores_per_replica doesn't match the number of + partitions(dims.prod()). + """ + if dims is None: + return + + dims = np.array(dims) + + if (dims < 1).any(): + raise ValueError("All input partition dims must be >= 1.") + + # No partitioning, so don't perform further checks. + if dims.prod() == 1: + return + + if dims.prod() != self._device_assignment.num_cores_per_replica: + raise ValueError( + "The product of each input parition dim should equal to " + "num_cores_per_replica. (dim = {}, num_cores_per_replica " + "= {})".format(dims, self._device_assignment.num_cores_per_replica)) + if dims.shape[0] != tensor.shape.ndims: + raise ValueError( + "Input partition dims must have the same number of dimensions " + "as the `Tensor` to be partitioned. (tensor shape = {}, input " + "partition dims = {}).".format(tensor.shape.as_list(), dims)) + + tensor.shape.assert_is_fully_defined() + if (np.array(tensor.shape.as_list()) % dims != 0).any(): + raise ValueError( + "All input partition dims must divide exactly into the `Tensor` " + "shape (tensor shape = {}, input partition dims = {}).".format( + tensor.shape.as_list(), dims)) + + def _partition_or_replicate_on_host(self, tensor, dims): + """Partitions or replicates the input tensor. + + The ops inside this function are placed on the host side. + + Args: + tensor: The input tensor which will be partioned or replicated. + dims: A list of integer describes how to partition the input tensor. + Returns: + An iterator of `Tensor`s or a list of partioned tensors. + """ + self._check_input_partition_dims(tensor, dims) + if dims is None: + return itertools.repeat(tensor) + else: + output = [tensor] + for axis, dim in enumerate(dims): + if dim > 1: + output = [array_ops.split(x, dim, axis=axis) for x in output] + output = nest.flatten(output) + return output + + def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims): + """Tags appropriate XLA sharding attribute to the dequeued tensor. + + Args: + tensor: The dequeued tensor on TPU. + dims: A list of integer describes how the tensor is partitioned. + + Returns: + The same tensor with the xla_sharding attribute. + """ + if dims is None: + return xla_sharding.replicate(tensor) + elif np.prod(dims) == 1: + return xla_sharding.assign_device(tensor, 0) + else: + tile_shape = np.array(tensor.shape.as_list()) // dims + tile_assignment = np.arange(np.prod(dims)).reshape(dims) + return xla_sharding.tile( + tensor=tensor, + tile_shape=xla_shape.CreateShapeFromDtypeAndTuple( + dtype=np.dtype(tensor.dtype.as_numpy_dtype), + shape_tuple=tile_shape), + tile_assignment=tile_assignment) + + def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims): + """Tags appropriate XLA sharding attribute to the dequeued tensors. + + Args: + dequeues: A list of dequeued tensors on TPU. + dims: A list of integer describes how the tensor is partitioned. + + Returns: + The same dequeues with appropriate xla_sharding attribute. + """ + nest.assert_shallow_structure(dequeues, dims) + return nest.map_structure_up_to( + dequeues, self._tag_sharding_attribute_for_dequeued_tensor, dequeues, + dims) + + def _get_logical_core(self, core_index): + """Maps the core index to the 3D coordinate within replica. + + The lowest dimension number in computation_shape is the slowest varying + dimension (most major). + + Args: + core_index: An integer represents the core index within replcia. + + Returns: + A tuple with three integers which represents the 3D coordinate. + """ + computation_shape = self._device_assignment.computation_shape + return (core_index // (computation_shape[1] * computation_shape[2]), + core_index % (computation_shape[1] * computation_shape[2]) // + computation_shape[2], core_index % computation_shape[2]) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py index 894f21d0635ca47d3da1c0d2c3f5c37bac690920..ec682e5829c4df536a043334b74200f0b6259df3 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py @@ -45,7 +45,7 @@ _TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [ ]) -def _query_tpu_system_metadata(master_address, run_config, +def _query_tpu_system_metadata(master_address, cluster_def=None, query_topology=False): """Automatically detects the TPU system metadata in the system.""" tpu_core_count = 0 @@ -61,7 +61,8 @@ def _query_tpu_system_metadata(master_address, run_config, with session_lib.Session( master_address, config=get_session_config_with_timeout( - _PINGING_MASTER_TIMEOUT_IN_MS, run_config)) as sess: + _PINGING_MASTER_TIMEOUT_IN_MS, + cluster_def)) as sess: devices = sess.list_devices() for device in devices: match = _TPU_DEVICE_REG.match(device.name) @@ -105,7 +106,7 @@ def _query_tpu_system_metadata(master_address, run_config, 'TPU worker has some problems. Available devices: {}'.format( master_address, devices)) - topology = _obtain_topology(master_address, run_config) + topology = _obtain_topology(master_address, cluster_def) metadata = _TPUSystemMetadata( num_cores=tpu_core_count, @@ -127,14 +128,15 @@ def _query_tpu_system_metadata(master_address, run_config, return metadata -def _obtain_topology(master_address, run_config): +def _obtain_topology(master_address, cluster_def): + """Obtains TPU fabric topology.""" try: logging.info('Initializing TPU system (master: %s) to fetch topology ' 'for model parallelism. This might take a while.', master_address) with ops.Graph().as_default(): session_config = get_session_config_with_timeout( - _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, run_config) + _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def) with session_lib.Session( master_address, config=session_config) as sess: topology = sess.run(tpu.initialize_system()) @@ -146,11 +148,8 @@ def _obtain_topology(master_address, run_config): master_address)) -def get_session_config_with_timeout(timeout_in_secs, run_config): - cluster_def = None - if run_config.session_config and run_config.session_config.cluster_def.job: - cluster_def = run_config.session_config.cluster_def - +def get_session_config_with_timeout(timeout_in_secs, cluster_def): + """Returns a session given a timeout and a cluster configuration.""" config = config_pb2.ConfigProto( operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def) return config diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 76927e62e82d02de172a0851819716dc63180371..ddf8365d6130dcb4c8234ac60c91955d007e2410 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -61,7 +61,7 @@ py_library( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/data", - "//tensorflow/python/estimator:inputs_queues", + "//tensorflow/python/estimator:estimator_py", "//third_party/py/numpy", "@six_archive//:six", ], @@ -133,7 +133,7 @@ py_test( "//tensorflow/python:framework_ops", "//tensorflow/python:session", "//tensorflow/python:training", - "//tensorflow/python/estimator:inputs_queues", + "//tensorflow/python/estimator:estimator_py", "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py index f7fd66d33fc0c329db7daaf87373385156d84217..01bac891da7ddf8523e6cc8c99decf4a61aa2741 100644 --- a/tensorflow/contrib/training/python/training/evaluation.py +++ b/tensorflow/contrib/training/python/training/evaluation.py @@ -142,9 +142,9 @@ from tensorflow.python.ops import state_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.summary import summary from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import evaluation from tensorflow.python.training import monitored_session -from tensorflow.python.training import saver as tf_saver from tensorflow.python.training import session_run_hook from tensorflow.python.training import training_util @@ -189,7 +189,7 @@ def wait_for_new_checkpoint(checkpoint_dir, logging.info('Waiting for new checkpoint at %s', checkpoint_dir) stop_time = time.time() + timeout if timeout is not None else None while True: - checkpoint_path = tf_saver.latest_checkpoint(checkpoint_dir) + checkpoint_path = checkpoint_management.latest_checkpoint(checkpoint_dir) if checkpoint_path is None or checkpoint_path == last_checkpoint: if stop_time is not None and time.time() + seconds_to_sleep > stop_time: return None diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py index 39d75a080604e3a7ae93391652d4c03be9857218..53e4f23a7cd940c026e462dc7fb55cf9f175bf02 100644 --- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py +++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py @@ -988,14 +988,14 @@ class SequenceQueueingStateSaver(object): assert isinstance(sequences, dict) assert isinstance(context, dict) assert isinstance(states, dict) - self._name_to_index = dict( - (name, ix) + self._name_to_index = { + name: ix for (ix, name) in enumerate([ "__length", "__total_length", "__next_key", "__sequence", "__sequence_count" ] + ["__sequence__%s" % k for k in sequences.keys()] + [ "__context__%s" % k for k in context.keys() - ] + ["__state__%s" % k for k in states.keys()])) + ] + ["__state__%s" % k for k in states.keys()])} self._index_to_name = [ name for (name, _) in sorted( diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py index a2444934bc21d58ed57d15494b3548a31ce3a2df..f46d03209ce7b111415b61181906c496f8181e71 100644 --- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py +++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py @@ -156,7 +156,7 @@ def prepend_from_queue_and_padded_batch_dataset(batch_size, Returns: A `Dataset` transformation function, which can be passed to - @{tf.data.Dataset.apply}. + `tf.data.Dataset.apply`. """ def _apply_fn(dataset): diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py index f72e0a3f831f9e9c61a2e9d77828ffb12d8428b1..c272a2ac144068cfb7355c2647eebf5bd0ce9d50 100644 --- a/tensorflow/contrib/training/python/training/training.py +++ b/tensorflow/contrib/training/python/training/training.py @@ -484,7 +484,8 @@ def train(train_op, save_checkpoint_secs=600, save_summaries_steps=100, config=None, - max_wait_secs=7200): + max_wait_secs=7200, + run_metadata=None): """Runs the training loop. Args: @@ -511,6 +512,7 @@ def train(train_op, become available. This should be kept relatively short to help detect incorrect code, but sometimes may need to be increased if the chief takes a while to start up. + run_metadata: A [`RunMetadata`] protocol buffer. Returns: the value of the loss function after training. @@ -541,5 +543,5 @@ def train(train_op, max_wait_secs=max_wait_secs) as session: loss = None while not session.should_stop(): - loss = session.run(train_op) + loss = session.run(train_op, run_metadata=run_metadata) return loss diff --git a/tensorflow/contrib/training/python/training/training_test.py b/tensorflow/contrib/training/python/training/training_test.py index 4877c010fad2c567d26b9674d2904274c0895f55..94cf7788b2bd3bc3fe87eefd599ce88de03042af 100644 --- a/tensorflow/contrib/training/python/training/training_test.py +++ b/tensorflow/contrib/training/python/training/training_test.py @@ -36,6 +36,7 @@ from tensorflow.python.ops.losses import losses from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import checkpoint_management from tensorflow.python.training import gradient_descent from tensorflow.python.training import monitored_session from tensorflow.python.training import saver as saver_lib @@ -421,7 +422,7 @@ class TrainTest(test.TestCase): train_op = self.create_train_op() model_variables = variables_lib2.global_variables() - model_path = saver_lib.latest_checkpoint(logdir1) + model_path = checkpoint_management.latest_checkpoint(logdir1) assign_fn = variables_lib.assign_from_checkpoint_fn( model_path, model_variables) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 35a112e8340ccee1f27fb1cd44227a37bff5bacd..0af8627290f0a0c4c72b256edc3d02be220e938a 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -735,7 +735,10 @@ cc_library( "util/reporter.h", ], copts = tf_copts(), - linkopts = ["-lm"], + linkopts = select({ + "//tensorflow:windows": [], + "//conditions:default": ["-lm"], + }), visibility = ["//visibility:public"], deps = [ ":lib", @@ -860,7 +863,6 @@ tf_cuda_library( "util/work_sharder.h", ] + select({ "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "util/memmapped_file_system.h", "util/memmapped_file_system_writer.h", @@ -2036,7 +2038,7 @@ cc_library( linkopts = select({ "//tensorflow:freebsd": [], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], + "//tensorflow:android": [], "//conditions:default": [ "-ldl", "-lpthread", @@ -2125,7 +2127,6 @@ cc_library( linkopts = select({ "//tensorflow:freebsd": [], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": ["-ldl"], }), deps = [ @@ -2150,7 +2151,6 @@ cc_library( linkopts = select({ "//tensorflow:freebsd": [], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": ["-ldl"], }), deps = [ @@ -2182,7 +2182,6 @@ cc_library( linkopts = select({ "//tensorflow:freebsd": [], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": ["-ldl"], }), deps = [ @@ -2238,6 +2237,7 @@ cc_library( linkopts = ["-ldl"], deps = [ "//tensorflow/core/platform/default/build_config:jpeg", + "//tensorflow/core/platform/default/build_config:logging", ], ) @@ -2266,6 +2266,7 @@ cc_library( linkopts = ["-ldl"], deps = [ "//tensorflow/core/platform/default/build_config:gif", + "//tensorflow/core/platform/default/build_config:logging", ], ) @@ -2292,6 +2293,7 @@ cc_library( copts = tf_copts(), linkopts = ["-ldl"], deps = [ + "//tensorflow/core/platform/default/build_config:logging", "@png_archive//:png", ], ) @@ -2483,7 +2485,6 @@ tf_cuda_library( ], ) + select({ "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], "//conditions:default": [ "util/memmapped_file_system.cc", "util/memmapped_file_system_writer.cc", @@ -2492,13 +2493,13 @@ tf_cuda_library( hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS, copts = tf_copts(), linkopts = select({ - "//tensorflow:freebsd": [], + "//tensorflow:freebsd": ["-lm"], "//tensorflow:windows": [], - "//tensorflow:windows_msvc": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lm", - ], + "//conditions:default": [ + "-ldl", + "-lm", + ], + }), deps = [ ":lib", ":lib_internal", @@ -3142,7 +3143,10 @@ cc_library( testonly = 1, srcs = ["platform/test_main.cc"], copts = tf_copts(), - linkopts = ["-lm"], + linkopts = select({ + "//tensorflow:windows": [], + "//conditions:default": ["-lm"], + }), visibility = ["//tensorflow:internal"], deps = [ ":lib", @@ -3233,6 +3237,7 @@ tf_cc_tests( "platform/fingerprint_test.cc", "platform/integral_types_test.cc", "platform/logging_test.cc", + "platform/mutex_test.cc", "platform/net_test.cc", "platform/port_test.cc", "platform/profile_utils/cpu_utils_test.cc", @@ -3490,6 +3495,7 @@ tf_cc_tests( "framework/tensor_shape_test.cc", "framework/tensor_slice_test.cc", "framework/tensor_test.cc", + "framework/tensor_testutil_test.cc", "framework/tensor_util_test.cc", "framework/tracking_allocator_test.cc", "framework/types_test.cc", @@ -4576,6 +4582,8 @@ filegroup( # PNG data "lib/png/testdata/lena_gray.png", "lib/png/testdata/lena_rgba.png", + "lib/png/testdata/lena_palette.png", + "lib/png/testdata/lena_palette_trns.png", # JPEG data "lib/jpeg/testdata/jpeg_merge_test1.jpg", "lib/jpeg/testdata/jpeg_merge_test1_cmyk.jpg", diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc index ae03a61ae66ec8d0119d91eefe8c64e61348e9b4..51812caeb2979270c913adee4fba2ce02f9c4d0e 100644 --- a/tensorflow/core/api_def/api_test.cc +++ b/tensorflow/core/api_def/api_test.cc @@ -59,8 +59,8 @@ void GetGoldenApiDefs(Env* env, const string& api_files_dir, file_contents = PBTxtFromMultiline(file_contents); ApiDefs api_defs; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString(file_contents, - &api_defs)) + QCHECK(tensorflow::protobuf::TextFormat::ParseFromString(file_contents, + &api_defs)) << "Failed to load " << file_path; CHECK_EQ(api_defs.op_size(), 1); (*name_to_api_def)[api_defs.op(0).graph_op_name()] = api_defs.op(0); diff --git a/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt b/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt index ad1ada8d717a51ee3a058da5d32ed7bf50375b13..3134fceecabb4969f5d8cf3a67e9288c7ca2a186 100644 --- a/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt @@ -1,4 +1,4 @@ op { graph_op_name: "Ceil" - summary: "Returns element-wise smallest integer in not less than x." + summary: "Returns element-wise smallest integer not less than x." } diff --git a/tensorflow/core/api_def/base_api/api_def_FilterByLastComponentDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FilterByLastComponentDataset.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..0b41229872347c586dd644f557df2f0dbdcddf5e --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_FilterByLastComponentDataset.pbtxt @@ -0,0 +1,7 @@ +op { + graph_op_name: "FilterByLastComponentDataset" + visibility: HIDDEN + summary: + "Creates a dataset containing elements of first " + "component of `input_dataset` having true in the last component." +} diff --git a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt index 342a1f6b0504046ae837e5e1ad1c91aaa2da95fc..a0e42dd02c5b570e34fb22867af53dcfce3a0f1d 100644 --- a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt @@ -27,7 +27,7 @@ slice of `params`: output[\\(i_0, ..., i_{K-2}\\)] = params[indices[\\(i_0, ..., i_{K-2}\\)]] -Whereas in @{tf.gather} `indices` defines slices into the first +Whereas in `tf.gather` `indices` defines slices into the first dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the first `N` dimensions of `params`, where `N = indices.shape[-1]`. diff --git a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt index e7bc5ddae237deb226606dc96141845e3efcc859..40d7d371ca2fbcd5ed886816b3cc8e2e0e11c27e 100644 --- a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt @@ -1,6 +1,6 @@ op { graph_op_name: "Igamma" - summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`." + summary: "Compute the lower regularized incomplete Gamma function `P(a, x)`." description: <