diff --git a/configure.py b/configure.py index f663f6df6f1d26b37b32474ae2963b2e8856822c..dfb87550b1c5ba9ab1f62e3c4149d53154e1e970 100644 --- a/configure.py +++ b/configure.py @@ -859,7 +859,7 @@ def set_tf_cuda_version(environ_cp): cuda_toolkit_paths_full = [ os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths ] - if any([os.path.exists(x) for x in cuda_toolkit_paths_full]): + if any(os.path.exists(x) for x in cuda_toolkit_paths_full): break # Reset and retry diff --git a/tensorflow/BUILD b/tensorflow/BUILD index ab704860022b8056eee2e0e6028dfa89ddf79b4b..fd4b94202aad24a82abef8abd16431f61a8326f0 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -361,7 +361,7 @@ package_group( "-//third_party/tensorflow/python/estimator", "//learning/meta_rank/...", "//tensorflow/...", - "//tensorflow_estimator/...", + "//tensorflow_estimator/contrib/...", "//tensorflow_fold/llgtm/...", "//tensorflow_text/...", "//third_party/py/tensor2tensor/...", diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index 0d49756838505289a960a6cabeb7cab02fad995b..59b07e15b878a94487d12459a70a70ba1a88c270 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -21,8 +21,6 @@ from __future__ import print_function as _print_function import os as _os # pylint: disable=g-bad-import-order -from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import - from tensorflow.python.tools import component_api_helper as _component_api_helper _component_api_helper.package_hook( parent_package_str=__name__, @@ -34,7 +32,8 @@ from tensorflow.python.platform import flags # pylint: disable=g-import-not-at- # Make sure directory containing top level submodules is in # the __path__ so that "from tensorflow.foo import bar" works. -_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__)) # pylint: disable=undefined-variable +# We're using bitwise, but there's nothing special about that. +_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__)) # pylint: disable=undefined-variable if _tf_api_dir not in __path__: __path__.append(_tf_api_dir) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 84238ffc1f2b73c59055461fbeba33687d756208..f653e581bf3beda9fdbf8fb7905a4f9fe170e7fb 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -121,6 +121,7 @@ tf_cuda_library( ":c_api", ":c_api_internal", "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_internal", "//tensorflow/compiler/jit:flags", "//tensorflow/contrib/tpu:all_ops", "//tensorflow/core:core_cpu", @@ -263,7 +264,7 @@ tf_cuda_cc_test( tf_cc_test( name = "c_api_experimental_test", - size = "small", + size = "medium", srcs = ["c_api_experimental_test.cc"], data = ["testdata/tf_record"], linkopts = select({ @@ -274,8 +275,11 @@ tf_cc_test( # the shared library must be able to use core:framework. # linkstatic = tf_kernel_tests_linkstatic(), deps = [ + ":c_api", ":c_api_experimental", ":c_test_util", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_test_util", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index bc434e759340f7f2c16346f2980d20270489d673..69de4cb711ef89734af3729c5e5518c14a7f5738 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -15,13 +15,18 @@ limitations under the License. #include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/platform.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" @@ -51,8 +56,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) { // These XLA flags are needed to trigger XLA properly from C (more generally // non-Python) clients. If this API is called again with `enable` set to // false, it is safe to keep these flag values as is. - tensorflow::legacy_flags::MarkForCompilationPassFlags* flags = - tensorflow::legacy_flags::GetMarkForCompilationPassFlags(); + tensorflow::MarkForCompilationPassFlags* flags = + tensorflow::GetMarkForCompilationPassFlags(); flags->tf_xla_cpu_global_jit = true; flags->tf_xla_min_cluster_size = 1; } else { @@ -71,8 +76,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation, // These XLA flags are needed to trigger XLA properly from C (more generally // non-Python) clients. If this API is called again with `enable` set to // false, it is safe to keep these flag values as is. - tensorflow::legacy_flags::MarkForCompilationPassFlags* flags = - tensorflow::legacy_flags::GetMarkForCompilationPassFlags(); + tensorflow::MarkForCompilationPassFlags* flags = + tensorflow::GetMarkForCompilationPassFlags(); flags->tf_xla_cpu_global_jit = true; flags->tf_xla_min_cluster_size = 1; } else { @@ -8739,8 +8744,55 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) { TF_DeleteStatus(status); } -TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, - const char* errMsg) { +struct TFE_ExecuteOpNotification { + TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {} + tensorflow::Notification n; + std::unique_ptr thread; + std::unique_ptr status; +}; + +TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(TFE_Op* op, + TFE_TensorHandle** retvals, + int* num_retvals, + TF_Status* status) { + TFE_ExecuteOpNotification* n = new TFE_ExecuteOpNotification; + + n->thread.reset(op->operation.EagerContext()->TFEnv()->StartThread( + tensorflow::ThreadOptions(), "ExecuteOpThread", + [op, retvals, num_retvals, n]() { + TFE_Execute(op, retvals, num_retvals, n->status.get()); + n->n.Notify(); + })); + + return n; +} + +void TFE_ExecuteOpNotificationWaitAndDelete( + TFE_ExecuteOpNotification* notification, TF_Status* status) { + if (notification == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "Passed in notification is a nullptr."); + + return; + } + if (notification->thread == nullptr) { + status->status = tensorflow::errors::InvalidArgument( + "Passed in notification didn't start a thread correctly. Cleaning up " + "this notification. Please re-execute the operation to get a new " + "notification."); + + delete notification; + return; + } + + notification->n.WaitForNotification(); + + status->status = notification->status->status; + + delete notification; +} + +void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) { status->status = tensorflow::errors::Internal(errMsg); } @@ -8800,3 +8852,21 @@ const char* TF_GetNumberAttrForOpListInput(const char* op_name, int input_index, // The returned string is owned by OpRegistry, so liveness is not a concern. return input_arg.number_attr().c_str(); } + +int TF_OpIsStateful(const char* op_type, TF_Status* status) { + const tensorflow::OpRegistrationData* op_reg_data; + status->status = + tensorflow::OpRegistry::Global()->LookUp(op_type, &op_reg_data); + if (!status->status.ok()) { + return 0; + } + return op_reg_data->op_def.is_stateful(); +} + +void TF_InitMain(const char* usage, int* argc, char*** argv) { + tensorflow::port::InitMain(usage, argc, argv); +} + +int TF_PickUnusedPortOrDie() { + return tensorflow::internal::PickUnusedPortOrDie(); +} diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 6639b0be72bdf81d0e3c806770364d7bc5082ad2..c04cd441bfbd89dafc8b3f0882ab06cd98a1b6fb 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -180,6 +180,25 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor( TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString( TFE_TensorHandle* handle); +typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification; + +// Allows invoking a kernel asynchronously, and explicitly returns a +// notification that can be waited upon. This always executes the kernel in a +// new thread. +// 1. `retvals` and `num_retvals` can only be consumed after +// `TFE_ExecuteOp` returns successfully. They shouldn't be used +// if the return is unsuccessful +// 2. These new APIs cannot be used together with the TFE context level async +// support. +TF_CAPI_EXPORT extern TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread( + TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, + TF_Status* status); + +// Waits to complete the op execution, and cleans up the notification. +// Errors reported by op execution are set in `status`. +TF_CAPI_EXPORT extern void TFE_ExecuteOpNotificationWaitAndDelete( + TFE_ExecuteOpNotification* notification, TF_Status* status); + TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg); @@ -209,6 +228,19 @@ TF_CAPI_EXPORT extern void TF_AttrBuilderCheckCanRunOnDevice( TF_CAPI_EXPORT extern const char* TF_GetNumberAttrForOpListInput( const char* op_name, int input_index, TF_Status* status); +// Returns 1 if the op is stateful, 0 otherwise. The return value is undefined +// if the status is not ok. +TF_CAPI_EXPORT extern int TF_OpIsStateful(const char* op_type, + TF_Status* status); + +// Platform specific initialization routine. Very few platforms actually require +// this to be called. +TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv); + +// Platform-specific implementation to return an unused port. (This should used +// in tests only.) +TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc index c6effd39697e0397278770b53e98508074f99862..daa7701b7fe7e8ce757b6504329cf6434ad39778 100644 --- a/tensorflow/c/c_api_experimental_test.cc +++ b/tensorflow/c/c_api_experimental_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/c/c_api_experimental.h" #include "tensorflow/c/c_test_util.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -162,5 +164,137 @@ protocol: "grpc" TF_DeleteStatus(status); } +TEST(CAPI_EXPERIMENTAL, IsStateful) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + int assign = TF_OpIsStateful("AssignAddVariableOp", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + EXPECT_EQ(assign, 1); + int id = TF_OpIsStateful("Identity", status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + EXPECT_EQ(id, 0); +} + +TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Simple) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + + TFE_Op* matmul_op = MatMulOp(ctx, m, m); + + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + + auto* r = + TFE_ExecuteOpInNewThread(matmul_op, &retvals[0], &num_retvals, status); + + TFE_ExecuteOpNotificationWaitAndDelete(r, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + + TFE_DeleteOp(matmul_op); + TFE_DeleteTensorHandle(m); + + TFE_DeleteTensorHandle(retvals[0]); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} + +// Perform a send/recv test. Recv blocks, so they need to be executed +// asynchronously. +TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + // Returns a 2x2 float32 Tensor on the CPU, with data 1., 2., 3., 4. + TFE_TensorHandle* m = TestMatrixTensorHandle(); + + // Build a send op. + TFE_Op* send_op = TFE_NewOp(ctx, "_Send", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(send_op, m, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + string tensor_name = "Tensor"; + TFE_OpSetAttrType(send_op, "T", TF_FLOAT); + TFE_OpSetAttrString(send_op, "tensor_name", tensor_name.c_str(), + tensor_name.size()); + string send_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + TFE_OpSetAttrString(send_op, "send_device", send_device.c_str(), + send_device.size()); + TFE_OpSetAttrInt(send_op, "send_device_incarnation", 1234); + string recv_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + TFE_OpSetAttrString(send_op, "recv_device", recv_device.c_str(), + recv_device.size()); + TFE_OpSetAttrBool(send_op, "client_terminated", true); + + // Build a recv op. + TFE_Op* recv_op = TFE_NewOp(ctx, "_Recv", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_OpSetAttrType(recv_op, "tensor_type", TF_FLOAT); + TFE_OpSetAttrString(recv_op, "tensor_name", tensor_name.c_str(), + tensor_name.size()); + TFE_OpSetAttrString(recv_op, "send_device", send_device.c_str(), + send_device.size()); + TFE_OpSetAttrInt(recv_op, "send_device_incarnation", 1234); + TFE_OpSetAttrString(recv_op, "recv_device", recv_device.c_str(), + recv_device.size()); + TFE_OpSetAttrBool(recv_op, "client_terminated", true); + + TFE_TensorHandle* send_retvals; + int send_num_retvals = 0; + auto* send_result = TFE_ExecuteOpInNewThread(send_op, &send_retvals, + &send_num_retvals, status); + + TFE_TensorHandle* recv_retvals[1] = {nullptr}; + int recv_num_retvals = 1; + auto* recv_result = TFE_ExecuteOpInNewThread(recv_op, &recv_retvals[0], + &recv_num_retvals, status); + + TFE_ExecuteOpNotificationWaitAndDelete(send_result, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_ExecuteOpNotificationWaitAndDelete(recv_result, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(recv_retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(1, product[0]); + EXPECT_EQ(2, product[1]); + EXPECT_EQ(3, product[2]); + EXPECT_EQ(4, product[3]); + + TFE_DeleteOp(send_op); + TFE_DeleteOp(recv_op); + TFE_DeleteTensorHandle(m); + + TFE_DeleteTensorHandle(recv_retvals[0]); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index c25b78434ff147a9eadff5349820b58fe6897782..ba3d8533db7623b8fa7fdf35093abcd1450776b1 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -133,7 +133,6 @@ tf_cuda_cc_test( tags = [ "guitar", "multi_gpu", - "no_oss", ], deps = [ ":c_api", diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 55331022b9dbd0696928fa44430f340f371432ac..0045bb5622647974a3c9f2cdf35bc21e126b4f52 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -589,9 +589,22 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) { TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); const int num_devices = TF_DeviceListCount(devices); + bool has_gpu0 = false; + bool has_gpu1 = false; + for (int i = 0; i < num_devices; ++i) { + const char* dev = TF_DeviceListName(devices, i, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + string device_name(dev); + if (device_name.find("GPU:0") != string::npos) { + has_gpu0 = true; + } + if (device_name.find("GPU:1") != string::npos) { + has_gpu1 = true; + } + } const char* kCPUDevice = "CPU:0"; - if (num_devices < 3) { + if (!has_gpu0 || !has_gpu1) { TF_DeleteDeviceList(devices); TF_DeleteTensor(t); TFE_DeleteTensorHandle(hcpu); diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 5ba55a203ff70cc64c07e96b5a869a1f11c9334e..5c11f51e8749de84547ae873f5f55ebd42bc4b3d 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -141,8 +141,9 @@ class GradientTape { // null. The result is populated with one tensor per target element. Status ComputeGradient( const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, - gtl::ArraySlice source_tensor_id, + const gtl::ArraySlice target_tensor_ids, + const gtl::ArraySlice source_tensor_ids, + const gtl::FlatMap sources_that_are_targets, gtl::ArraySlice output_gradients, std::vector* result); @@ -396,6 +397,7 @@ template Status InitialGradients( const VSpace& vspace, gtl::ArraySlice target_tensor_ids, + gtl::FlatMap sources_that_are_targets, gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, const OpTape& op_tape, gtl::FlatMap>* result) { @@ -425,8 +427,13 @@ Status InitialGradients( "none of operations outputs match expected tensor"); } } else { - // No record of the target tensor found on the tape, so no gradient - // needs to be computed from it. Do nothing. + // This target tensor was not generated by any operation recorded on + // the tape, so no gradient needs to be computed from it unless this + // target is also a source. + auto source_tensor = sources_that_are_targets.find(id); + if (source_tensor != sources_that_are_targets.end()) { + (*result)[id].push_back(vspace.Ones(source_tensor->second)); + } } } else { (*result)[id].push_back(output_gradients[i]); @@ -467,8 +474,9 @@ constexpr int kMinAggregateBytes = 128 * 1024 * 1024; template Status GradientTape::ComputeGradient( const VSpace& vspace, - gtl::ArraySlice target_tensor_ids, - gtl::ArraySlice source_tensor_ids, + const gtl::ArraySlice target_tensor_ids, + const gtl::ArraySlice source_tensor_ids, + const gtl::FlatMap sources_that_are_targets, gtl::ArraySlice output_gradients, std::vector* result) { gtl::FlatSet sources_set(source_tensor_ids.begin(), @@ -478,7 +486,8 @@ Status GradientTape::ComputeGradient( std::vector op_stack = InitialStack(state.op_tape, state.op_missing_tensor); gtl::FlatMap> gradients; - Status s = InitialGradients(vspace, target_tensor_ids, output_gradients, + Status s = InitialGradients(vspace, target_tensor_ids, + sources_that_are_targets, output_gradients, tensor_tape_, state.op_tape, &gradients); auto cleanup = [this, &state]() { if (!persistent_) { diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 83353b79f722f0a95f508b32d4a49b14b35624fb..a09becc49b10d2c58f98fbcc11df5190f794c1d4 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -489,6 +489,7 @@ tf_gen_op_wrappers_cc( "image_ops", "io_ops", "linalg_ops", + "list_ops", "logging_ops", "lookup_ops", "manip_ops", diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index c6abe2f41b9b5ec2faee6f65b429ff606f8ac08e..ec116f68cf4b61c9b2d15065916ad9169017b659 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -193,6 +193,15 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, std::vector* asset_file_defs) { + // With SavedModel v2, we write asset file def into metagraph instead of + // collection, so read from metagraph first. + if (meta_graph_def.asset_file_def_size() > 0) { + for (const auto& asset : meta_graph_def.asset_file_def()) { + asset_file_defs->push_back(asset); + } + return Status::OK(); + } + // Fall back to read from collection to be backward compatible with v1. const auto& collection_def_map = meta_graph_def.collection_def(); const auto assets_it = collection_def_map.find(kSavedModelAssetsKey); if (assets_it == collection_def_map.end()) { diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index b17bc658fa06b9feb7edb292bd89ef31e6309169..697599f3bb114d411f3242ecde77dcc2eeefbffe 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -164,7 +164,8 @@ string RewriteWithName(const string& name, string code, } // Generate methods for args (inputs). -Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps, +Status GenArgMethods(const tf2xla::Config& config, + const xla::ProgramShapeProto& ps, const CompileResult& compile_result, string* methods) { size_t num_args = ps.parameters_size(); if (config.feed_size() != num_args) { @@ -204,7 +205,7 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps, // Generate methods for results (outputs). Status GenResultMethods(const tf2xla::Config& config, - const xla::ProgramShape& ps, string* methods) { + const xla::ProgramShapeProto& ps, string* methods) { if (ps.result().element_type() != xla::TUPLE) { // The XlaCompiler we use to build the xla computation always generates a // tuple result, and we rely on this to simplify code generation. @@ -336,7 +337,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, ExtractEntryParamBufferInfos(buffer_infos); std::vector buffer_infos_for_temps = ExtractTempBufferInfos(buffer_infos); - const xla::ProgramShape& ps = compile_result.program_shape; + const xla::ProgramShapeProto& ps = compile_result.program_shape; string methods_arg, methods_result; TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); @@ -548,8 +549,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static const char** StaticResultNames() {{RESULT_NAMES_CODE}} // Shape of the args and results. - static const xla::ProgramShape* StaticProgramShape() { - static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}}; + static const xla::ProgramShapeProto* StaticProgramShape() { + static const xla::ProgramShapeProto* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}}; return kShape; } @@ -615,11 +616,11 @@ static string CreateUniqueIdentifier(const CodegenOpts& opts, Status GenerateMetadata(const CodegenOpts& opts, const CompileResult& compile_result, MetadataResult* metadata_result) { - std::unique_ptr program_shape; + std::unique_ptr program_shape; if (opts.gen_program_shape) { program_shape = - absl::make_unique(compile_result.program_shape); + absl::make_unique(compile_result.program_shape); // The parameter names are currently meaningless, and redundant with the // rest of our metadata, so clear them out to avoid confusion and save @@ -631,8 +632,8 @@ Status GenerateMetadata(const CodegenOpts& opts, // a shim that evaluates to nullptr, which is what we want. ProtobufToEmbed program_shape_protobuf{ - CreateUniqueIdentifier(opts, "ProgramShape"), "xla::ProgramShape", - program_shape.get()}; + CreateUniqueIdentifier(opts, "ProgramShapeProto"), + "xla::ProgramShapeProto", program_shape.get()}; ProtobufToEmbed hlo_profile_printer_data_protobuf{ CreateUniqueIdentifier(opts, "HloProfilePrinterData"), diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h index 90410c46a8e36e44454f1219ad76d0fb0937070d..9485e86b10e225a3c9c12eafd9905bdf7c15c9fa 100644 --- a/tensorflow/compiler/aot/codegen.h +++ b/tensorflow/compiler/aot/codegen.h @@ -57,7 +57,7 @@ struct MetadataResult { std::vector header_variable_decls; // program_shape_access_shim is a C++ expression that constructs the - // xla::ProgramShape instance for the CompileResult passed to + // xla::ProgramShapeProto instance for the CompileResult passed to // GenerateMetadata. string program_shape_access_shim; diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index bb288d23000527be74f01630d20bbf82e50007ce..c1788ca32a1d099284eeb870f9513891051fd29e 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -181,13 +181,15 @@ TEST(CodegenTest, Golden) { BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1), BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)}, 5, {})); - compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( - { - xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), - xla::ShapeUtil::MakeShape(xla::S64, {3, 4}), - }, - xla::ShapeUtil::MakeTupleShape( - {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})})); + compile_result.program_shape = + xla::ShapeUtil::MakeProgramShape( + { + xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), + xla::ShapeUtil::MakeShape(xla::S64, {3, 4}), + }, + xla::ShapeUtil::MakeTupleShape( + {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})})) + .ToProto(); compile_result.entry_point = "entry_point"; compile_result.pointer_size = 8; diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index e4d8a02877c75fa72c5747650ab9c7ac229955b3..a2cdab5d1a8e72504ca11b789287d4efd07a59e9 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -22,7 +22,7 @@ extern "C" void entry_point( void* result, const xla::ExecutableRunOptions* run_options, const void** args, void** temps, tensorflow::int64* profile_counters); -extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[]; +extern "C" char __tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[]; namespace foo { @@ -253,10 +253,10 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { } // Shape of the args and results. - static const xla::ProgramShape* StaticProgramShape() { - static const xla::ProgramShape* kShape = []() { - xla::ProgramShape* proto = new xla::ProgramShape; - proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[0], 52); + static const xla::ProgramShapeProto* StaticProgramShape() { + static const xla::ProgramShapeProto* kShape = []() { + xla::ProgramShapeProto* proto = new xla::ProgramShapeProto; + proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 52); return proto; }(); return kShape; diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden index eb001c5d45bdfefc76629d7303d89f5480432235..ce8e5ec8c96a2c3696f14b8eea206d648182ecb5 100644 Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 2b5f97b34cd928d32eb220536342c715d91d45bb..3bc99ef7e61178ad51f14e19d4b4417a37471c5c 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -56,8 +56,8 @@ Status CompileXla(xla::CompileOnlyClient* client, return errors::Unknown("Couldn't get XLA program shape: ", pshape_or.status().error_message()); } - compile_result->program_shape = *pshape_or.ValueOrDie(); - xla::ProgramShape* pshape = &compile_result->program_shape; + compile_result->program_shape = pshape_or.ValueOrDie()->ToProto(); + xla::ProgramShapeProto* pshape = &compile_result->program_shape; std::vector arg_layouts; arg_layouts.reserve(pshape->parameters_size()); for (int i = 0; i < pshape->parameters_size(); ++i) { diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h index e03c5b1aa77c1262ed903aae3072ef65f34d80a2..ee7bb26fabd2d897b85b62f38778ecbfe2238eb6 100644 --- a/tensorflow/compiler/aot/compile.h +++ b/tensorflow/compiler/aot/compile.h @@ -33,9 +33,9 @@ namespace tfcompile { struct CompileResult { // Contains object file and meta-info. std::unique_ptr aot; - xla::ProgramShape program_shape; // Static shape of args and results. - string entry_point; // Name of generated function. - int pointer_size = 0; // Size of a pointer in bytes. + xla::ProgramShapeProto program_shape; // Static shape of args and results. + string entry_point; // Name of generated function. + int pointer_size = 0; // Size of a pointer in bytes. }; // CompileGraph compiles the graph_def into an object file containing a function diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index f10852c7850f61bfd8b99fa9f1648202d182085e..711feed8f321ff9b17c1dd90b389d3f9b11e0a74 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -526,7 +526,7 @@ TEST(TFCompileTest, ProgramShape) { // muladd has the program shape defined. MatMulAndAddComp muladd; - const xla::ProgramShape* muladd_shape = muladd.ProgramShape(); + const xla::ProgramShapeProto* muladd_shape = muladd.ProgramShape(); ASSERT_TRUE(muladd_shape != nullptr); ASSERT_EQ(muladd_shape->parameters_size(), 2); EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(0), f32_2x2)); diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc index 01bea34af91fbd8dbb7236db0390f26fa4517642..9f4042630edaec1b9519b6434d859a48372e8b15 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc @@ -320,10 +320,10 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) { return IsXlaCompiledKernel(*n); }); - bool lazy_compilation_enabled = enable_lazy_compilation_ - ? *enable_lazy_compilation_ - : legacy_flags::GetBuildXlaOpsPassFlags() - .tf_xla_enable_lazy_compilation; + bool lazy_compilation_enabled = + enable_lazy_compilation_ + ? *enable_lazy_compilation_ + : GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation; for (Node* n : xla_compiled_kernels) { TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun( diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc index 28ec37b1b9c8a1a306b5e778bac5b6ba01c2c997..bcc3213285bee2a2094bd6c39b37ba95874d90ed 100644 --- a/tensorflow/compiler/jit/encapsulate_util.cc +++ b/tensorflow/compiler/jit/encapsulate_util.cc @@ -86,7 +86,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name, continue; } else if (src_xla_computation && !dst_xla_computation) { if (src_outside_compilation) { - // Case 1d: outside compilation to host computation control edge. + // Case 1c: outside compilation to host computation control edge. edges_to_remove.push_back(e); TF_RETURN_IF_ERROR(AppendToListAttr( @@ -94,7 +94,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name, } } else if (!src_xla_computation && dst_xla_computation) { if (dst_outside_compilation) { - // Case 1d: host computation control to outside compilation edge. + // Case 1c: host computation control to outside compilation edge. edges_to_remove.push_back(e); TF_RETURN_IF_ERROR(AppendToListAttr( @@ -103,40 +103,24 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name, } else { // src_xla_computation && dst_xla_computation if (*src_xla_computation != *dst_xla_computation) { if (src_outside_compilation && dst_outside_compilation) { - // Case 1c: outside compilation to outside compilation control edge. + // Case 1b: outside compilation to outside compilation control edge. edges_to_remove.push_back(e); TF_RETURN_IF_ERROR(AppendToListAttr( e->dst(), kXlaControlDependenciesAttrName, e->src()->name())); } else if (src_outside_compilation && !dst_outside_compilation) { - // Case 1b: outside compilation to another XLA computaition control + // Case 1a: outside compilation to another XLA computaition control // edge. TF_RETURN_IF_ERROR(AppendToListAttr( e->src(), kXlaConnectedToOtherXlaComputationAttrName, *dst_xla_computation)); } else if (!src_outside_compilation && dst_outside_compilation) { - // Case 1b: another XLA computaition to outside compilation control + // Case 1a: another XLA computaition to outside compilation control // edge. TF_RETURN_IF_ERROR(AppendToListAttr( e->dst(), kXlaConnectedFromOtherXlaComputationAttrName, *src_xla_computation)); } - } else { // *src_xla_computation == *dst_xla_computation - if (src_outside_compilation && dst_outside_compilation) { - if (*src_outside_compilation != *dst_outside_compilation) { - // Case 1c: outside compilation to outside compilation control edge. - edges_to_remove.push_back(e); - - TF_RETURN_IF_ERROR(AppendToListAttr( - e->dst(), kXlaControlDependenciesAttrName, e->src()->name())); - } - } else if (src_outside_compilation && !dst_outside_compilation) { - // Case 1a: outside compilation to its XLA computation control edge. - ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true); - } else if (!src_outside_compilation && dst_outside_compilation) { - // Case 1a: XLA computation to outside compilation in it control edge. - ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true); - } } } } @@ -181,12 +165,6 @@ Status ProcessXlaToXlaDataEdges(Graph* g, edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()}); VLOG(4) << "XLA -> XLA edge: " << e->DebugString(); } - } else { // *src_xla_computation == *dst_xla_computation - if (src_outside_compilation && dst_outside_compilation && - *src_outside_compilation != *dst_outside_compilation) { - edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()}); - VLOG(4) << "XLA -> XLA edge: " << e->DebugString(); - } } } @@ -594,14 +572,242 @@ Status AddControlDependencies( return Status::OK(); } +// Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of +// `PreprocessEdgesBetweenOutsideCompilations` for details. +Status PreprocessControlEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Gather edges to remove. We should not remove the edge while iterating. + std::vector edges_to_remove; + for (const Edge* e : g->edges()) { + if (!e->IsControlEdge()) { + continue; + } + + auto src_outside_compilation = + GetStringAttr(*e->src(), outside_compilation_attr_name); + auto dst_outside_compilation = + GetStringAttr(*e->dst(), outside_compilation_attr_name); + + if (src_outside_compilation && dst_outside_compilation) { + if (*src_outside_compilation != *dst_outside_compilation) { + // Case 1a: outside compilation to outside compilation control edge. + edges_to_remove.push_back(e); + + TF_RETURN_IF_ERROR(AppendToListAttr( + e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName, + e->src()->name())); + } + } else if (src_outside_compilation && !dst_outside_compilation) { + // Case 1b: outside compilation to its XLA computation control edge. + ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true); + } else if (!src_outside_compilation && dst_outside_compilation) { + // Case 1b: XLA computation to outside compilation in it control edge. + ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true); + } + } + + for (auto e : edges_to_remove) { + g->RemoveEdge(e); + } + return Status::OK(); +} + +// Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of +// `PreprocessEdgesBetweenOutsideCompilations` for details. +Status PreprocessDataEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Gather edges between outside compilation and host computation. Notice that + // we do not store `Edge*` directly because we remove some nodes while adding + // Identity nodes, and those Edge pointers might be invalidated. + struct EdgeInfo { + int dst_input, dst_node_id; + }; + std::vector edges; + for (const Edge* e : g->edges()) { + if (e->IsControlEdge()) { + continue; + } + + auto src_outside_compilation = + GetStringAttr(*e->src(), outside_compilation_attr_name); + auto dst_outside_compilation = + GetStringAttr(*e->dst(), outside_compilation_attr_name); + + if (src_outside_compilation && dst_outside_compilation && + *src_outside_compilation != *dst_outside_compilation) { + edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()}); + VLOG(4) << "Oc -> oc edge: " << e->DebugString(); + } + } + + // Remove the edge from host to outside compilation. Add a placeholder as + // outside compilation node input. + std::map placeholders; + for (int i = 0; i < edges.size(); i++) { + Node* dst = g->FindNodeId(edges[i].dst_node_id); + const Edge* e; + TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e)); + Node* src = e->src(); + int src_output = e->src_output(), dst_input = e->dst_input(); + g->RemoveEdge(e); + + // Find or create placeholder node. + string new_name = absl::StrCat(src->name(), "_oc_to_oc_placeholder"); + auto iter = placeholders.find(new_name); + Node* placeholder_node; + if (iter == placeholders.end()) { + NodeDefBuilder placeholder_builder(new_name, "Placeholder"); + placeholder_builder.Attr("dtype", src->output_type(src_output)); + string outside_compilation_attr; + TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), + outside_compilation_attr_name, + &outside_compilation_attr)); + placeholder_builder.Attr(outside_compilation_attr_name, + outside_compilation_attr); + placeholder_builder.Attr(kOutsideCompilationOriginalNodeAttrName, + src->name()); + placeholder_builder.Attr(kOutsideCompilationSrcOutputAttrName, + src_output); + NodeDef placeholder_def; + TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def)); + Status s; + placeholder_node = g->AddNode(placeholder_def, &s); + TF_RETURN_IF_ERROR(s); + placeholders[new_name] = placeholder_node; + } else { + placeholder_node = iter->second; + } + g->AddEdge(placeholder_node, 0, dst, dst_input); + + // Replace `e->dst()` because its input node changed. + NodeDef new_def = dst->def(); + *new_def.mutable_input(dst_input) = placeholder_node->name(); + TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def)); + + // Other edge in `edges` might have `e->dst()` as src or dst + // node. Before removing `e->dst()`, replace those edges with + // corresponding edges for `dst_replace_node`. + for (int j = i + 1; j < edges.size(); j++) { + if (edges[j].dst_node_id == edges[i].dst_node_id) { + edges[j].dst_node_id = dst_replace_node->id(); + } + } + } + return Status::OK(); +} + +// Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of +// `PostprocessEdgesBetweenOutsideCompilations` for details. +Status PostprocessDataEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Gather all outside compilation to outside compilation nodes. + std::vector placeholder_nodes; + for (Node* n : g->nodes()) { + if (n->type_string() == "Placeholder" && + HasNodeAttr(n->def(), kOutsideCompilationOriginalNodeAttrName)) { + placeholder_nodes.push_back(n); + } + } + + // Remove the placeholder nodes, and reconnect original edge. + auto node_name_index = g->BuildNodeNameIndex(); + for (auto n : placeholder_nodes) { + string node_name; + int node_src_output; + TF_RETURN_IF_ERROR(GetNodeAttr( + n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name)); + TF_RETURN_IF_ERROR(GetNodeAttr( + n->attrs(), kOutsideCompilationSrcOutputAttrName, &node_src_output)); + auto iter = node_name_index.find(node_name); + if (iter == node_name_index.end()) { + return errors::Internal( + "Cannot find original node for oc -> host placeholder node ", + node_name); + } + + // Change all usage node to use the original node instead. + Node* original_node = iter->second; + std::vector control_edges; + std::vector data_edges; + for (auto e : n->out_edges()) { + if (e->IsControlEdge()) { + control_edges.push_back(e); + } else { + data_edges.push_back({e->dst(), e->src_output(), e->dst_input()}); + } + } + for (const Edge* e : control_edges) { + g->AddControlEdge(original_node, e->dst()); + g->RemoveEdge(e); + } + for (int i = 0; i < data_edges.size(); i++) { + Node* dst = data_edges[i].dst; + NodeDef new_def = dst->def(); + int dst_input = data_edges[i].dst_input; + *new_def.mutable_input(dst_input) = + absl::StrCat(original_node->name(), ":", node_src_output); + TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def)); + + const Edge* edge_to_replace = nullptr; + TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace)); + g->RemoveEdge(edge_to_replace); + g->AddEdge(original_node, node_src_output, replace_node, dst_input); + + // Other edges might have `dst` as dst node. Update those edges with + // `replace_node`. + for (int j = i + 1; j < data_edges.size(); j++) { + if (data_edges[j].dst == dst) { + data_edges[j].dst = replace_node; + } + } + + // Other placeholder node might have `dst` as original node. Update + // `node_name_index` with `replace_node`. + node_name_index[replace_node->name()] = replace_node; + } + + // Remove placeholder node. + g->RemoveNode(n); + } + return Status::OK(); +} + +// Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of +// `PostprocessEdgesBetweenOutsideCompilations` for details. +Status PostprocessControlEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + auto node_name_index = g->BuildNodeNameIndex(); + + // Reconnect outside compilation to outside compilation control edge. + for (Node* n : g->nodes()) { + std::vector control_deps; + Status s = + GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName, + &control_deps); + if (!s.ok()) { + if (s.code() != error::NOT_FOUND) { + return s; + } else { + continue; + } + } else { + n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName); + for (const string& control_input : control_deps) { + auto iter = node_name_index.find(control_input); + if (iter == node_name_index.end()) { + return errors::Internal("Cannot find original node for ", + control_input); + } + g->AddControlEdge(iter->second, n); + } + } + } + return Status::OK(); +} } // namespace const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes"; -const char kXlaConnectedToXlaComputationAttrName[] = - "_xla_connected_to_xla_computation"; -const char kXlaConnectedFromXlaComputationAttrName[] = - "_xla_connected_from_xla_computation"; const char kXlaConnectedToOtherXlaComputationAttrName[] = "_xla_connected_to_other_xla_computation"; const char kXlaConnectedFromOtherXlaComputationAttrName[] = @@ -616,6 +822,15 @@ const char kHostToOutsideCompilationOriginalNodeAttrName[] = "_xla_host_to_oc_node_name"; const char kHostToOutsideCompilationSrcOutputAttrName[] = "_xla_host_to_oc_src_output"; +const char kXlaConnectedToXlaComputationAttrName[] = + "_xla_connected_to_xla_computation"; +const char kXlaConnectedFromXlaComputationAttrName[] = + "_xla_connected_from_xla_computation"; +const char kOutsideCompilationOriginalNodeAttrName[] = + "_xla_oc_to_oc_node_name"; +const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output"; +const char kXlaControlDependenciesWithinXlaClusterAttrName[] = + "_xla_control_dependencies_within_xla_cluster"; Status PerformStaticShapeInferenceBeforeEncapsulation( Graph* g, const string& xla_computation_attr_name, @@ -699,4 +914,39 @@ Status PostprocessForEncapsulation( return Status::OK(); } +Status PreprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + // Remove edges from source node to outside compilation nodes, and edges + // from outside compilation nodes to sink node. + std::vector edges_to_remove; + for (const Edge* e : g->source_node()->out_edges()) { + if (HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) { + edges_to_remove.push_back(e); + } + } + for (const Edge* e : g->sink_node()->in_edges()) { + if (HasNodeAttr(e->src()->def(), outside_compilation_attr_name)) { + edges_to_remove.push_back(e); + } + } + for (auto e : edges_to_remove) { + g->RemoveEdge(e); + } + + TF_RETURN_IF_ERROR(PreprocessControlEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + TF_RETURN_IF_ERROR(PreprocessDataEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + return Status::OK(); +} + +Status PostprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name) { + TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations( + g, outside_compilation_attr_name)); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h index 5e0c4bf6a0cc92d69209595e257989665404db6b..e363bc5754ac395bae262dc67a780a0173efaf5e 100644 --- a/tensorflow/compiler/jit/encapsulate_util.h +++ b/tensorflow/compiler/jit/encapsulate_util.h @@ -44,14 +44,6 @@ Status PerformStaticShapeInferenceBeforeEncapsulation( Graph* g, const string& xla_computation_attr_name, const string& outside_compilation_attr_name); -// Attribute indicating that some ops in this node's XLA computation has control -// dependency on this node. Attribute value will always be "true". -extern const char kXlaConnectedToXlaComputationAttrName[]; - -// Attribute indicating that this node has control dependency on some ops in -// this node's XLA computation. Attribute value will always be "true". -extern const char kXlaConnectedFromXlaComputationAttrName[]; - // Attribute indicating that some ops in other XLA computation has control // dependency on this node. Attribute value will be a list of string (XLA // computation names). @@ -81,6 +73,14 @@ extern const char kOutsideCompilationToHostOriginalNodeAttrName[]; // int (src_output for original edge). extern const char kOutsideCompilationToHostSrcOutputAttrName[]; +// Attribute indicating that some ops in this node's XLA computation has control +// dependency on this node. Attribute value will always be "true". +extern const char kXlaConnectedToXlaComputationAttrName[]; + +// Attribute indicating that this node has control dependency on some ops in +// this node's XLA computation. Attribute value will always be "true". +extern const char kXlaConnectedFromXlaComputationAttrName[]; + // Attribute indicating that this is an Placeholder node added to act as a // temporary input node for an host node. Attribute value will be string // (original input node name). @@ -91,19 +91,31 @@ extern const char kHostToOutsideCompilationOriginalNodeAttrName[]; // for original edge). extern const char kHostToOutsideCompilationSrcOutputAttrName[]; -// Preprocesses the graph for encapsulation. It will perform the following -// operations in order: +// Attribute indicating that this is an Placeholder node added to act as a +// temporary input node for an outside compilation node. Attribute value will be +// string (original input node name). +extern const char kOutsideCompilationOriginalNodeAttrName[]; + +// Attribute indicating that this is an Placeholder node added to act as a +// temporary input node for an outside compilation node. Attribute value will be +// int (src_output for original edge). +extern const char kOutsideCompilationSrcOutputAttrName[]; + +// Attribute indicating that this node has control dependencies on some other +// nodes within the same XLA cluster. Attribute value will be a list of string +// (node names). +extern const char kXlaControlDependenciesWithinXlaClusterAttrName[]; + +// Preprocesses edges between different XLA clusters for encapsulation. It will +// perform the following operations in order: // -// 1a. For control edges between outside compilation and its XLA computation, -// add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the -// outside compilation node. -// 1b. For control edges between outside compilation and another XLA +// 1a. For control edges between outside compilation and another XLA // computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName // = XLA computation node name" to the outside compilation node. -// 1c. For control edges between different outside compilations, remove the edge -// and add attr "kXlaControlDependenciesAttrName = src node name" to dst -// node. -// 1d. For control edges between outside compilation and host computation, +// 1b. For control edges between different outside compilations (in different +// XLA computations), remove the edge and add attr +// "kXlaControlDependenciesAttrName = src node name" to dst node. +// 1c. For control edges between outside compilation and host computation, // remove the edge and add attr "kXlaControlDependenciesAttrName = src node // name" to dst node. // 2. For data edges between different XLA computations, if either src or dst @@ -146,26 +158,53 @@ struct XlaClusterInfo { const std::map host_compute_core; }; -// Postprocesses the graph for encapsulation. This function reverts what -// `PreprocessForEncapsulation` did. It will perform the following operations in -// order: +// Postprocesses edges between different XLA clusters for encapsulation. This +// function reverts what `PreprocessForEncapsulation` did. It will perform the +// following operations in order: // // 1. Remove Placeholder nodes between outside compilation and host computation // (created in `PreprocessForEncapsulation` step 3). // 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2. -// 3a. Reconnect control edges between different outside compilations (marked by -// `PreprocessForEncapsulation` step 1c) and control edges between outside -// compilation and host computation (marked by `PreprocessForEncapsulation` -// step 1d). -// 3b. Reconnect control edges between outside compilation and another XLA -// computation (marked by `PreprocessForEncapsulation` step 1b). -// Notice that control edges marked by `PreprocessForEncapsulation` step 1a are -// not handled here. They are handled in `RewriteOutsideCompilationSubgraphFn`. +// 3a. Reconnect control edges between outside compilation and another XLA +// computation (marked by `PreprocessForEncapsulation` step 1a). +// 3b. Reconnect control edges between different outside compilations (marked by +// `PreprocessForEncapsulation` step 1b). +// 3c. Reconnect control edges between outside compilation and host computation +// (marked by `PreprocessForEncapsulation` step 1c). Status PostprocessForEncapsulation( Graph* g, const string& xla_computation_attr_name, const string& outside_compilation_attr_name, const std::unordered_map& clusters); +// Preprocesses edges within the same XLA cluster. It will perform the following +// operations in order: +// +// 0. Remove edges from source node to outside compilation nodes, and edges +// from outside compilation nodes to sink node. +// 1a. For edges between different outside compilation clusters, remove the edge +// and add attr "kXlaControlDependenciesWithinXlaClusterAttrName = src node +// name" to dst node. +// 1b. For control edges between outside compilation and its XLA computation, +// add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the +// outside compilation node. +// 2. For data edges between different outside compilations, remove the edge +// and create a Placeholder node as dst node's input. +Status PreprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name); + +// Postprocesses edges within the same XLA cluster. This function reverts what +// `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the +// following operations in order: +// +// 1. Remove Placeholder nodes between different outside compilations (created +// in `PreprocessEdgesBetweenOutsideCompilations` step 2). +// 2a. Reconnect control edges between different outside compilations (marked by +// `PreprocessEdgesBetweenOutsideCompilations` step 1a). +// Notice that control edges marked by +// `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here. +// They are handled in `RewriteOutsideCompilationSubgraphFn`. +Status PostprocessEdgesBetweenOutsideCompilations( + Graph* g, const string& outside_compilation_attr_name); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_ diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc index 7255df3112916b7abcc98ff8204efc8c02209b13..25c32cef01d7f9877a35001457539f2ad189192f 100644 --- a/tensorflow/compiler/jit/encapsulate_util_test.cc +++ b/tensorflow/compiler/jit/encapsulate_util_test.cc @@ -107,28 +107,19 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) { identity4_node->AddAttr("_xla", "1"); identity4_node->AddAttr("_oc", "0"); identity5_node->AddAttr("_xla", "1"); - // Case 1a: control edges between outside compilation and its XLA computation. - g.AddControlEdge(add_node, identity0_node); - g.AddControlEdge(identity0_node, identity1_node); - // Case 1b: control edges between outside compilation and another XLA + // Case 1a: control edges between outside compilation and another XLA // computation. g.AddControlEdge(identity0_node, identity3_node); g.AddControlEdge(identity1_node, identity4_node); - // Case 1c: control edges between different outside compilations. + // Case 1b: control edges between different outside compilations. g.AddControlEdge(identity0_node, identity4_node); - // Case 1d: control edges between outside compilation and host computation. + // Case 1c: control edges between outside compilation and host computation. g.AddControlEdge(const0_node, identity0_node); g.AddControlEdge(identity0_node, identity2_node); TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc")); - // Case 1a: add attr "_xla_connected_{from/to}_xla_computation = true" to the - // outside compilation node. - EXPECT_TRUE(HasNodeAttr(identity0_node->def(), - kXlaConnectedFromXlaComputationAttrName)); - EXPECT_TRUE(HasNodeAttr(identity0_node->def(), - kXlaConnectedToXlaComputationAttrName)); - // Case 1b: add attr "_xla_control_deps_{from/to} = XLA computation node name" + // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name" // to the outside compilation node. std::vector attr; TF_CHECK_OK(GetNodeAttr(identity0_node->def(), @@ -140,13 +131,13 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) { kXlaConnectedFromOtherXlaComputationAttrName, &attr)); EXPECT_EQ(attr.size(), 1); EXPECT_EQ(attr[0], "0"); - // Case 1c: add attr "_xla_control_deps = src node name" to dst node. + // Case 1b: add attr "_xla_control_deps = src node name" to dst node. attr.clear(); TF_CHECK_OK(GetNodeAttr(identity4_node->def(), kXlaControlDependenciesAttrName, &attr)); EXPECT_EQ(attr.size(), 1); EXPECT_EQ(attr[0], "identity0"); - // Case 1d: add attr "_xla_control_deps = src node name" to dst node. + // Case 1c: add attr "_xla_control_deps = src node name" to dst node. attr.clear(); TF_CHECK_OK(GetNodeAttr(identity0_node->def(), kXlaControlDependenciesAttrName, &attr)); diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 8b3587c5087a0651c466f53f3709ba21e75dd273..e3c7e2f89be9b37b51a633dabb099969c181013f 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -366,7 +366,7 @@ Status ReplaceOrRemoveOutsideCompilationCallNode( // replace this node with compilation result node. // 3) all outside compilation graphs. Status ConstructHostGraph( - const string& xla_cluster_name, + const string& xla_cluster_name, const string& outside_compilation_attr_name, const std::vector& outside_compilation_host_graphs, FunctionLibraryDefinition* fld, std::unique_ptr* host_graph) { host_graph->reset(new Graph(fld)); @@ -476,6 +476,10 @@ Status ConstructHostGraph( host_graph->get(), std::unordered_set{(*host_graph)->sink_node()}); + // Postprocess edges between different outside compilations. + TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations( + host_graph->get(), outside_compilation_attr_name)); + if (VLOG_IS_ON(4)) { dump_graph::DumpGraphToFile( absl::StrCat("extract_outside_compilation_host_graph_for_", @@ -801,6 +805,11 @@ Status ExtractOutsideCompilationForFunction( }, &fbody)); std::unique_ptr fbody_deleter(fbody); + + // Preprocess edges between different outside compilations. They will be + // restored in `ConstructHostGraph()`. + TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations( + fbody->graph, outside_compilation_attr_name)); if (VLOG_IS_ON(4)) { dump_graph::DumpGraphToFile( absl::StrCat("extract_outside_compilation_for_func_before_", func_name), @@ -860,8 +869,9 @@ Status ExtractOutsideCompilationForFunction( // Construct host graph. if (!outside_compilation_host_graphs.empty()) { - TF_RETURN_IF_ERROR(ConstructHostGraph( - xla_cluster_name, outside_compilation_host_graphs, fld, host_graph)); + TF_RETURN_IF_ERROR( + ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name, + outside_compilation_host_graphs, fld, host_graph)); } // Remove the outside compilation graphs from function library. diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc index c5bd64f004ef98853955372680277e04c16bdc9e..bff956100da661b679b4557fce53671e6cef88c5 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc @@ -290,21 +290,18 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) { TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes)); EXPECT_EQ(shapes.size(), 1); EXPECT_EQ(shapes[0].dim_size(), 1); - // Check XlaHostCompute nodes' "shape_inference_graph" attr. "0" should have a - // non-empty value, and "1" should have an empty value. + // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have + // empty values. string shape_inference_graph; TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph", &shape_inference_graph)); - EXPECT_EQ(shape_inference_graph, - "_outside_compilation_shape_inference_cluster_0"); + EXPECT_EQ(shape_inference_graph, ""); TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph", &shape_inference_graph)); EXPECT_EQ(shape_inference_graph, ""); // Check `shape_inference_graphs`. - EXPECT_EQ(shape_inference_graphs.size(), 1); - EXPECT_EQ(shape_inference_graphs[0], - "_outside_compilation_shape_inference_cluster_0"); + EXPECT_EQ(shape_inference_graphs.size(), 0); // Check `host_graph`: verify we have key placeholder and sequencer. Node *key_placeholder = nullptr, *sequencer = nullptr; @@ -333,8 +330,8 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) { send_recv_nodes.push_back(n); } } - EXPECT_EQ(num_send_from_host, 2); - EXPECT_EQ(num_recv_at_host, 2); + EXPECT_EQ(num_send_from_host, 1); + EXPECT_EQ(num_recv_at_host, 1); for (Node *n : send_recv_nodes) { Node *input_node; TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node)); diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index 702197e1b166b801d52d405d12811a724f73b0b8..98e344b3a080aa8aab27cd41564a90427bac151e 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -20,7 +20,6 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { -namespace legacy_flags { namespace { BuildXlaOpsPassFlags* build_ops_flags; @@ -150,5 +149,4 @@ void AppendDumpGraphFlags(std::vector* flag_list) { AppendDumpGraphFlagsInternal(flag_list); } -} // namespace legacy_flags } // namespace tensorflow diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 50f4f358365fd6600803b1c696da49a0a4077c7a..5ddea588eef5270880d91623dc05893da265960a 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { -namespace legacy_flags { // Flags associated with the XLA bridge's mark_for_compilation_pass module. struct MarkForCompilationPassFlags { @@ -99,7 +98,6 @@ void AppendMarkForCompilationPassFlags( std::vector* flag_list); void AppendDumpGraphFlags(std::vector* flag_list); -} // namespace legacy_flags } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_FLAGS_H_ diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc index a8295bb77b997ade4f726d09207365a47f32818a..ce53f70b79d97ab087fefe542920b33f883632a2 100644 --- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc +++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc @@ -345,8 +345,7 @@ Status FindAndRewriteSlices(Graph* g, bool* changed) { Status IncreaseDynamismForAutoJitPass::Run( const GraphOptimizationPassOptions& options) { - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); if (flags->tf_xla_clustering_debug) { dump_graph::DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass", **options.graph, options.flib_def); diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 2dea0c53302a955aee98f7993d02b923bfd66c68..ad71df5a694a5f8da94675049df1062a7edb6253 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -418,7 +418,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { cannot_compile_cluster = cannot_compile_cluster_; } - if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation || + if (GetXlaOpsCommonFlags().tf_xla_always_defer_compilation || cannot_compile_cluster) { executable = nullptr; } else { diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 9b02b0c3f99bf3c9891abefce771120406b48f21..25796435a5c87af5e252981abf96833f4cda9a5e 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -72,6 +72,11 @@ struct OperationFilter { // to resort to a dummy implementation. Currently Assert and CheckNumerics ops // have dummy XLA implementations. bool allow_dummy_ops; + + // Whether ops that produce or consume DT_VARIANT values are allowed. We + // don't auto-cluster these ops because we don't yet support live-in or + // live-out DT_VARIANT values. + bool allow_ops_producing_or_consuming_variant; }; bool IsDummyImplOp(absl::string_view op_name) { @@ -84,6 +89,12 @@ bool IsStatefulRandomOp(absl::string_view op_name) { op_name == "TruncatedNormal"; } +bool OpProducesOrConsumesVariant(const Node& node) { + auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; }; + return absl::c_any_of(node.input_types(), is_variant) || + absl::c_any_of(node.output_types(), is_variant); +} + bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by @@ -246,6 +257,10 @@ bool IsCompilableCall(const NodeDef& call_def, if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) { return false; } + if (!op_filter.allow_ops_producing_or_consuming_variant && + OpProducesOrConsumesVariant(*node)) { + return false; + } if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1, lib_runtime)) { @@ -427,8 +442,7 @@ Status FindCompilationCandidates( BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr, &compile_time_const_nodes)); - int64& fuel = - legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel; + int64& fuel = GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel; // Iterate over nodes in sorted order so that compiler fuel is deterministic. // We can't simply pass op_nodes().begin() and op_nodes().end to the @@ -471,16 +485,15 @@ Status FindCompilationCandidates( XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)); DeviceType jit_device_type(registration->compilation_device_name); + bool always_auto_cluster = registration->autoclustering_policy == + XlaOpRegistry::AutoclusteringPolicy::kAlways; + OperationFilter op_filter; op_filter.allow_resource_ops = registration->compile_resource_ops; - op_filter.allow_stateful_rng_ops = - (registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kAlways); - op_filter.allow_control_trigger = - (registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kAlways); - op_filter.allow_dummy_ops = (registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kAlways); + op_filter.allow_stateful_rng_ops = always_auto_cluster; + op_filter.allow_control_trigger = always_auto_cluster; + op_filter.allow_dummy_ops = always_auto_cluster; + op_filter.allow_ops_producing_or_consuming_variant = always_auto_cluster; if (!HasXLAKernel(*node, jit_device_type) && !IsCompilableCall(node->def(), jit_device_type, op_filter, 0, @@ -504,6 +517,12 @@ Status FindCompilationCandidates( << node->type_string() << ")"; continue; } + if (!op_filter.allow_ops_producing_or_consuming_variant && + OpProducesOrConsumesVariant(*node)) { + VLOG(2) << "Rejecting " << node->name() + << ": produces or consumes DT_VARIANT"; + continue; + } if (!op_filter.allow_resource_ops && (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) { @@ -607,8 +626,7 @@ OptimizerOptions::GlobalJitLevel GetGlobalJitLevel( // To set compilation to be on by default, change the following line. global_jit_level = OptimizerOptions::OFF; } - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); if (flags->tf_xla_auto_jit == -1 || (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) { // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides @@ -641,6 +659,7 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { op_filter.allow_stateful_rng_ops = true; op_filter.allow_control_trigger = true; op_filter.allow_dummy_ops = true; + op_filter.allow_ops_producing_or_consuming_variant = true; return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr); } @@ -651,8 +670,7 @@ Status MarkForCompilationPass::Run( // device ahead of time. OptimizerOptions::GlobalJitLevel global_jit_level = GetGlobalJitLevel(options); - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); bool fusion_only = flags->tf_xla_fusion_only; VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only; @@ -953,8 +971,7 @@ Status MarkForCompilationPass::RunImpl( OptimizerOptions::GlobalJitLevel global_jit_level = GetGlobalJitLevel(options); - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); // Repeatedly contract edges between clusters that are on the same device, // provided the contraction would not create a cycle. diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 24d78c077268f83cebbdafddc1a658ae8dc6b8d8..bf2c5508ea9e987e80093f4c2e15d3ff5191126f 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/control_flow_ops_internal.h" #include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/list_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/sendrecv_ops.h" #include "tensorflow/cc/ops/standard_ops.h" @@ -1147,5 +1148,80 @@ TEST(XlaCompilationTest, DontAutoClusterDummyOps) { EXPECT_EQ(clusters["test/check"], ""); } +TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64); + Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64); + + Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32); + Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32); + + Output tensor_list_reserve = ops::TensorListReserve( + root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["test/tensor_list_reserve"], ""); +} + +TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output dummy_input = + ops::Placeholder(root.WithOpName("test/dummy_input"), DT_INT64); + Output variant_input = + ops::Placeholder(root.WithOpName("test/variant_input"), DT_VARIANT); + + // Create one more node so that we don't avoid creating a cluster solely + // because it would be trivial. + Output dummy_cast = + ops::Cast(root.WithOpName("test/dummy_cast"), dummy_input, DT_INT32); + + Output tensor_list_element_shape = ops::TensorListElementShape( + root.WithOpName("test/tensor_list_element_shape"), variant_input, + DT_INT32); + + root.graph()->AddControlEdge(dummy_cast.node(), + tensor_list_element_shape.node()); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_EQ(clusters["test/tensor_list_element_shape"], ""); +} + +TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64); + Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64); + + Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32); + Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32); + + Output tensor_list_reserve = ops::TensorListReserve( + root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; + for (Node* n : graph->nodes()) { + if (absl::StartsWith(n->name(), /*prefix=*/"test/")) { + n->set_assigned_device_name(xla_cpu_device); + } + } + + TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + EXPECT_NE(clusters["test/tensor_list_reserve"], ""); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 13126e0878c8cd813238bcc23f9c3e9b1cab40d3..9006dd514b166ad8291d2d437305e53de2a093a4 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -37,7 +37,7 @@ class XlaCpuDeviceFactory : public DeviceFactory { Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options, const string& name_prefix, std::vector* devices) { - legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags(); + XlaDeviceFlags* flags = GetXlaDeviceFlags(); bool compile_on_demand = flags->tf_xla_compile_on_demand; XlaOpRegistry::DeviceRegistration registration; diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 738bac54cca450857b506681a6d8fe54fbffb86c..4201ff91a89b1bee370e6a43337c51abe3bf974a 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -410,6 +410,31 @@ Status XlaDevice::Sync() { return Status::OK(); } +void XlaDevice::Sync(const DoneCallback& done) { + VLOG(1) << "XlaDevice::Sync (asynchronous)"; + std::shared_ptr stream; + { + mutex_lock lock(mu_); + stream = stream_; + } + if (!stream) { + done(Status::OK()); + return; + } + + stream->ThenEnqueueOnBackgroundThread( + [this, stream, done](se::StreamExecutor*) { + tracing::ScopedActivity activity("XlaDevice::Sync::Callback", + /*is_expensive=*/true); + mutex_lock lock(mu_); + while (outstanding_asynchronous_operations_ > 0) { + outstanding_asynchronous_operations_cv_.wait(lock); + } + done(stream->ok() ? Status::OK() + : errors::Internal("XlaDevice::Sync() failed.")); + }); +} + Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, Tensor* tensor) { diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index dc8f49a9c975a25cbae0e980749db5a1daf039e4..c8bb276cdb9673fdcba4cc15a9f33ecd3ae96dbb 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -135,6 +135,7 @@ class XlaDevice : public LocalDevice { void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) override; Status Sync() override; + void Sync(const DoneCallback& done) override; Status FillContextMap(const Graph* graph, DeviceContextMap* device_context_map) override diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py index 69fb3ec2964a09508e612515b9e291fc14121d68..e9c2d363acab96c0fb968cb7f901ce105ea8703e 100644 --- a/tensorflow/compiler/tests/adagrad_da_test.py +++ b/tensorflow/compiler/tests/adagrad_da_test.py @@ -50,8 +50,8 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() @@ -63,9 +63,9 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534 # similarly for others. self.assertAllCloseAccordingToType( - np.array([-0.904534, -1.603567]), var0.eval()) + np.array([-0.904534, -1.603567]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.094821, -0.189358]), var1.eval()) + np.array([-0.094821, -0.189358]), self.evaluate(var1)) def testAdagradDAwithoutRegularizationBasic2(self): for dtype in self.float_types: @@ -87,16 +87,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() self.assertAllCloseAccordingToType( - np.array([-0.904534, -1.603567]), var0.eval()) + np.array([-0.904534, -1.603567]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.094821, -0.189358]), var1.eval()) + np.array([-0.094821, -0.189358]), self.evaluate(var1)) def testAdagradDAWithL1(self): for dtype in self.float_types: @@ -118,16 +118,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() self.assertAllCloseAccordingToType( - np.array([-0.895489, -1.59555]), var0.eval()) + np.array([-0.895489, -1.59555]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.085339, -0.17989]), var1.eval()) + np.array([-0.085339, -0.17989]), self.evaluate(var1)) def testAdagradDAWithL1_L2(self): for dtype in self.float_types: @@ -149,16 +149,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1]), global_step=global_step) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run a step of AdagradDA update.run() self.assertAllCloseAccordingToType( - np.array([-0.046907, -0.093659]), var0.eval()) + np.array([-0.046907, -0.093659]), self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([-0.004275, -0.009023]), var1.eval()) + np.array([-0.004275, -0.009023]), self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py index ab69319c59fb07e7ce56c3c287a50a6290effdfd..e26483303c3934fd51675cb1fbc998b276caf527 100644 --- a/tensorflow/compiler/tests/adagrad_test.py +++ b/tensorflow/compiler/tests/adagrad_test.py @@ -42,17 +42,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of adagrad for _ in range(3): ada_update.run() # Validate updated params self.assertAllCloseAccordingToType( - np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(), + np.array([-1.6026098728179932, -0.6026098728179932]), + self.evaluate(var0), float_rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([2.715679168701172, 3.715679168701172]), var1.eval(), + np.array([2.715679168701172, 3.715679168701172]), + self.evaluate(var1), float_rtol=1e-5) def testTensorLearningRate(self): @@ -68,17 +70,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase): zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of adagrad for _ in range(3): ada_update.run() # Validate updated params self.assertAllCloseAccordingToType( - np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(), + np.array([-1.6026098728179932, -0.6026098728179932]), + self.evaluate(var0), float_rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([2.715679168701172, 3.715679168701172]), var1.eval(), + np.array([2.715679168701172, 3.715679168701172]), + self.evaluate(var1), float_rtol=1e-5) def testSharing(self): @@ -103,18 +107,20 @@ class AdagradOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values. - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Mix the first and the second adagrad for 3 steps. ada_update1.run() ada_update2.run() ada_update1.run() # Validate updated params (the same as with only 1 Adagrad). self.assertAllCloseAccordingToType( - np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(), + np.array([-1.6026098728179932, -0.6026098728179932]), + self.evaluate(var0), float_rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([2.715679168701172, 3.715679168701172]), var1.eval(), + np.array([2.715679168701172, 3.715679168701172]), + self.evaluate(var1), float_rtol=1e-5) diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py index 058576b3d4b695209952158769162bb24e7ccfce..8bcff9d379d34f8a6bb8b0fdc60b7588c6d80be9 100644 --- a/tensorflow/compiler/tests/adam_test.py +++ b/tensorflow/compiler/tests/adam_test.py @@ -75,23 +75,24 @@ class AdamOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power, beta2_power = opt._get_beta_accumulators() # Run 3 steps of Adam for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**t, + self.evaluate(beta2_power)) update.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testTensorLearningRate(self): for dtype in self.float_types: @@ -117,23 +118,24 @@ class AdamOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power, beta2_power = opt._get_beta_accumulators() # Run 3 steps of Adam for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**t, + self.evaluate(beta2_power)) update.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testSharing(self): for dtype in self.float_types: @@ -162,13 +164,14 @@ class AdamOptimizerTest(xla_test.XLATestCase): beta1_power, beta2_power = opt._get_beta_accumulators() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of intertwined Adam1 and Adam2. for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) - self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) + self.assertAllCloseAccordingToType(0.999**t, + self.evaluate(beta2_power)) if t % 2 == 0: update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np}) else: @@ -178,8 +181,8 @@ class AdamOptimizerTest(xla_test.XLATestCase): var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py index 3ed1d41b7121f44dd7470f61180f7a7055369174..961b46375c941bdc3922e460a2f58345086dbceb 100644 --- a/tensorflow/compiler/tests/adamax_test.py +++ b/tensorflow/compiler/tests/adamax_test.py @@ -78,8 +78,8 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power = opt._get_beta_accumulators() @@ -87,14 +87,17 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase): for t in range(1, 4): update.run() - self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval()) + self.assertAllCloseAccordingToType(0.9**(t + 1), + self.evaluate(beta1_power)) var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval(), rtol=1e-2) - self.assertAllCloseAccordingToType(var1_np, var1.eval(), rtol=1e-2) + self.assertAllCloseAccordingToType( + var0_np, self.evaluate(var0), rtol=1e-2) + self.assertAllCloseAccordingToType( + var1_np, self.evaluate(var1), rtol=1e-2) self.assertEqual("var0_%d/AdaMax:0" % (i,), opt.get_slot(var=var0, name="m").name) @@ -118,22 +121,23 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) beta1_power = opt._get_beta_accumulators() # Run 3 steps of AdaMax for t in range(1, 4): - self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval()) + self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power)) update.run() var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0) var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) + if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py index 1bc07ace23ccdc83103abe71ee11b72994c75a6d..a37c97e6d374440aeb860b9d02f2d5dd95c91f62 100644 --- a/tensorflow/compiler/tests/addsign_test.py +++ b/tensorflow/compiler/tests/addsign_test.py @@ -90,8 +90,8 @@ class AddSignTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 7 steps of AddSign # first 4 steps with positive gradient @@ -125,8 +125,8 @@ class AddSignTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - var0_np, var0.eval(), half_rtol=1e-2) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + var0_np, self.evaluate(var0), half_rtol=1e-2) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testDense(self): decay_steps = 10 diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py index a57d1dc81ea2c9c188b0a3005904738aa8156bf3..f17e84df13c43add0eb877b48a8c46f6d0767419 100644 --- a/tensorflow/compiler/tests/categorical_op_test.py +++ b/tensorflow/compiler/tests/categorical_op_test.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import random_seed from tensorflow.python.ops import array_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.platform import googletest @@ -138,6 +139,36 @@ class CategoricalTest(xla_test.XLATestCase): chi2 = self._chi2(probs, freqs) self.assertLess(chi2, 1e-3) + def testStatelessMultinomialIsInRange(self): + for dtype in self.float_types: + for output_dtype in self.output_dtypes(): + with self.cached_session() as sess: + with self.test_scope(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + x = stateless_random_ops.stateless_multinomial( + array_ops.ones(shape=[1, 20], dtype=dtype), + 1000, + seed_t, + output_dtype=output_dtype) + y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]}) + self.assertTrue((y >= 0).sum() == 1000) + self.assertTrue((y < 20).sum() == 1000) + + def testDeterminismMultinomial(self): + # Stateless values should be equal iff the seeds are equal (roughly) + num_samples = 10 + with self.cached_session(), self.test_scope(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + seeds = [(x, y) for x in range(5) for y in range(5)] * 3 + for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2], + [0.25, 0.75]]): + pure = stateless_random_ops.stateless_multinomial( + logits, num_samples, seed=seed_t) + values = [(seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds] + for s0, v0 in values: + for s1, v1 in values: + self.assertEqual(s0 == s1, np.all(v0 == v1)) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py index 88bd58b2da6b2892f898ad10f3467d8ce39d6388..ef2d7af69deeebd5f4c4c7225d7027f8f76bf861 100644 --- a/tensorflow/compiler/tests/clustering_test.py +++ b/tensorflow/compiler/tests/clustering_test.py @@ -43,7 +43,7 @@ class ClusteringTest(xla_test.XLATestCase): input1 = constant_op.constant(val1, name="const1") input2 = constant_op.constant(val2, name="const2") output = math_ops.add(input1, input2) - result = output.eval() + result = self.evaluate(output) self.assertAllClose(result, expected, rtol=1e-3) def testAddFromCpuMultiple(self): @@ -57,7 +57,7 @@ class ClusteringTest(xla_test.XLATestCase): with self.test_scope(): output = math_ops.add(input1, input2) for _ in xrange(10): - result = output.eval() + result = self.evaluate(output) self.assertAllClose(result, expected, rtol=1e-3) def testDeadlock(self): diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py index 2d225ad226cac368042b95eae8fc29e6fd8e82e0..30fbe6f701fdea9c908bc7aec8e8b10ca505d2f6 100644 --- a/tensorflow/compiler/tests/concat_ops_test.py +++ b/tensorflow/compiler/tests/concat_ops_test.py @@ -72,7 +72,7 @@ class ConcatTest(xla_test.XLATestCase): x2 = constant_op.constant(p2) with self.test_scope(): c = array_ops.concat([x1, x2], 0) - result = c.eval() + result = self.evaluate(c) self.assertAllEqual(result[:2, :], p1) self.assertAllEqual(result[2:, :], p2) @@ -150,7 +150,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, 1) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) def testGradientsSimpleAll(self): @@ -177,7 +177,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, 0) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) @@ -205,7 +205,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, 2) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) @@ -242,7 +242,7 @@ class ConcatTest(xla_test.XLATestCase): [float(x) for x in grad_inp.flatten()], shape=output_shape) grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor]) concated_grad = array_ops.concat(grad, concat_dim) - result = concated_grad.eval() + result = self.evaluate(concated_grad) self.assertAllEqual(result, grad_inp) @@ -280,7 +280,7 @@ class ConcatTest(xla_test.XLATestCase): with self.test_scope(): concat_list_t = array_ops.concat([c1, c2], 0) concat_tuple_t = array_ops.concat((c1, c2), 0) - self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval()) + self.assertAllEqual(concat_list_t.eval(), self.evaluate(concat_tuple_t)) def testConcatNoScalars(self): with self.cached_session(): diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py index d59fd0236f4f7da2bbfb3409342c7f70f8f5d1f6..01cc1b6392845be2418c50d55be97487eb290843 100644 --- a/tensorflow/compiler/tests/conv3d_test.py +++ b/tensorflow/compiler/tests/conv3d_test.py @@ -85,7 +85,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase): 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( x, f, y_shape, strides=strides, padding="SAME") - value = output.eval() + value = self.evaluate(output) # We count the number of cells being added at the locations in the output. # At the center, #cells = kernel_depth * kernel_height * kernel_width @@ -135,7 +135,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase): 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( x, f, y_shape, strides=strides, padding="SAME") - value = output.eval() + value = self.evaluate(output) for n in xrange(x_shape[0]): for k in xrange(f_shape[3]): @@ -173,7 +173,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase): 1.0, shape=f_shape, name="filter", dtype=dtypes.float32) output = nn_ops.conv3d_transpose( x, f, y_shape, strides=strides, padding="VALID") - value = output.eval() + value = self.evaluate(output) cache_values = np.zeros(y_shape, dtype=np.float32) diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py index d1b90f098d7d6574999ba0af44b285f5ad5e4f8d..23c94cf24516fd26f6b2430b347aa0a909374cae 100644 --- a/tensorflow/compiler/tests/dense_layer_test.py +++ b/tensorflow/compiler/tests/dense_layer_test.py @@ -42,7 +42,7 @@ def GetRunMetadataLabels(run_metadata): def InLabels(labels, substr): """Returns true iff one of the labels contains substr.""" - return any([substr in x for x in labels]) + return any(substr in x for x in labels) class DenseLayerTest(test.TestCase): diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py index 8c7edfd277c992c35a81dd5f261256a86352254e..91d77d2f791834346f43aecb60d116ddbf2faa6e 100644 --- a/tensorflow/compiler/tests/fifo_queue_test.py +++ b/tensorflow/compiler/tests/fifo_queue_test.py @@ -129,7 +129,7 @@ class FIFOQueueTest(xla_test.XLATestCase): enqueue_op.run() for i in xrange(len(elems)): - vals = dequeued_t.eval() + vals = self.evaluate(dequeued_t) self.assertEqual([elems[i]], vals) def testEnqueueAndBlockingDequeue(self): @@ -192,9 +192,9 @@ class FIFOQueueTest(xla_test.XLATestCase): self.assertEqual([], size.get_shape()) enqueue_op.run() - self.assertEqual(1, size.eval()) + self.assertEqual(1, self.evaluate(size)) dequeued_t.op.run() - self.assertEqual(0, size.eval()) + self.assertEqual(0, self.evaluate(size)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py index 5b197afd655404e4e36a8b3442f8db60cb1d648d..b078053cdbd6d129645734492d34dd25d28ab3ef 100644 --- a/tensorflow/compiler/tests/ftrl_test.py +++ b/tensorflow/compiler/tests/ftrl_test.py @@ -50,14 +50,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run Ftrl for a few steps for _ in range(steps): ftrl_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def equivAdagradTest_AdagradPart(self, steps, dtype): var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) @@ -65,14 +65,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): adagrad_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run Adagrad for a few steps for _ in range(steps): adagrad_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def equivGradientDescentTest_FtrlPart(self, steps, dtype): var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) @@ -85,14 +85,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run Ftrl for a few steps for _ in range(steps): ftrl_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def equivGradientDescentTest_GradientDescentPart(self, steps, dtype): var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype) @@ -100,14 +100,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): sgd_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run GradientDescent for a few steps for _ in range(steps): sgd_update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def testFtrlwithoutRegularization(self): for dtype in self.float_types: @@ -124,8 +124,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run 3 steps FTRL for _ in range(3): @@ -134,12 +134,12 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( np.array([-2.60260963, -4.29698515]), - var0.eval(), + self.evaluate(var0), float_rtol=1e-4, half_rtol=1e-2) self.assertAllCloseAccordingToType( np.array([-0.28432083, -0.56694895]), - var1.eval(), + self.evaluate(var1), float_rtol=1e-5, half_rtol=1e-2) @@ -158,8 +158,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 3 steps FTRL for _ in range(3): @@ -167,10 +167,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5, + np.array([-2.55607247, -3.98729396]), + self.evaluate(var0), + 1e-5, + 1e-5, float_rtol=1e-4) self.assertAllCloseAccordingToType( - np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5) + np.array([-0.28232238, -0.56096673]), self.evaluate(var1), 1e-5, + 1e-5) def testFtrlWithL1(self): for dtype in self.float_types: @@ -187,8 +191,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -197,12 +201,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( np.array([-7.66718769, -10.91273689]), - var0.eval(), + self.evaluate(var0), rtol=1e-4, bfloat16_rtol=1e-1, bfloat16_atol=1e-1) self.assertAllCloseAccordingToType( - np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4) + np.array([-0.93460727, -1.86147261]), + self.evaluate(var1), + rtol=1e-4) def testFtrlWithL1_L2(self): for dtype in self.float_types: @@ -219,8 +225,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -228,9 +234,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5) + np.array([-0.24059935, -0.46829352]), + self.evaluate(var0), + rtol=1e-5) self.assertAllCloseAccordingToType( - np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5) + np.array([-0.02406147, -0.04830509]), + self.evaluate(var1), + rtol=1e-5) def testFtrlWithL1_L2_L2Shrinkage(self): """Test the new FTRL op with support for l2 shrinkage. @@ -254,8 +264,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -263,9 +273,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # Validate updated params self.assertAllCloseAccordingToType( - np.array([-0.22578996, -0.44345799]), var0.eval(), rtol=1e-4) + np.array([-0.22578996, -0.44345799]), + self.evaluate(var0), + rtol=1e-4) self.assertAllCloseAccordingToType( - np.array([-0.14378493, -0.13229476]), var1.eval(), rtol=1e-4) + np.array([-0.14378493, -0.13229476]), + self.evaluate(var1), + rtol=1e-4) def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self): """Verifies that l2 shrinkage in FTRL does not change lr schedule.""" @@ -291,8 +305,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase): update1 = opt1.apply_gradients([(grads1, var1)]) variables.global_variables_initializer().run() - self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval()) - self.assertAllCloseAccordingToType([1.0, 2.0], var1.eval()) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0)) + self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var1)) # Run 10 steps FTRL for _ in range(10): @@ -301,7 +315,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase): # var0 is experiencing L2 shrinkage so it should be smaller than var1 # in magnitude. - self.assertTrue((var0.eval()**2 < var1.eval()**2).all()) + self.assertTrue((var0.eval()**2 < self.evaluate(var1)**2).all()) accum0 = list(opt0._slots["accum"].values())[0].eval() accum1 = list(opt1._slots["accum"].values())[0].eval() # L2 shrinkage should not change how we update grad accumulator. diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 6f51ae33a1b0fc8670ddf0cacb03a3b5a9176a91..dbea9849e217519874352b789588a2af62f1c826 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -75,7 +75,7 @@ def RunMetadataLabels(run_metadata): def InLabels(labels, substr): """Returns true iff one of the labels contains substr.""" - return any([substr in x for x in labels]) + return any(substr in x for x in labels) def MetadataHasXlaRunOp(run_metadata): diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py index c6ad67993e8bc196a74c9a328df8c9200c92c575..5dddf6ae4e8c8a3d5e9eb7b2c62298df02a0093c 100644 --- a/tensorflow/compiler/tests/lrn_ops_test.py +++ b/tensorflow/compiler/tests/lrn_ops_test.py @@ -120,8 +120,8 @@ class LRNTest(xla_test.XLATestCase): with self.test_scope(): actual = gen_nn_ops.lrn_grad(out_grads, in_image, out_image, depth_radius, bias, alpha, beta) - expected_val = expected.eval() - actual_val = actual.eval() + expected_val = self.evaluate(expected) + actual_val = self.evaluate(actual) self.assertAllClose(actual_val, expected_val, rtol=1e-3) diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py index f77521a7c49dba39849869ddceb7c0e885147722..3416f7dbd6bdd264bf79785084f981f5b07cb8a9 100644 --- a/tensorflow/compiler/tests/momentum_test.py +++ b/tensorflow/compiler/tests/momentum_test.py @@ -61,37 +61,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase): self.assertFalse(slot1 in variables.trainable_variables()) # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Step 1: the momentum accumulators where 0. So we should see a normal # update: v -= grad * learning_rate mom_update.run() # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval()) - self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval()) + self.assertAllCloseAccordingToType( + np.array([0.1, 0.1]), self.evaluate(slot0)) + self.assertAllCloseAccordingToType( + np.array([0.01, 0.01]), self.evaluate(slot1)) # Check that the parameters have been updated. self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval()) + np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), + self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval()) + np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), + self.evaluate(var1)) # Step 2: the momentum accumulators contain the previous update. mom_update.run() # Check that the momentum accumulators have been updated. self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()) + np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), + self.evaluate(slot0)) self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval()) + np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), + self.evaluate(slot1)) # Check that the parameters have been updated. self.assertAllCloseAccordingToType( np.array([ 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), var0.eval()) + ]), self.evaluate(var0)) self.assertAllCloseAccordingToType( np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - ( - (0.9 * 0.01 + 0.01) * 2.0) - ]), var1.eval()) + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) + ]), self.evaluate(var1)) def testNesterovMomentum(self): for dtype in self.float_types: @@ -115,8 +121,8 @@ class MomentumOptimizerTest(xla_test.XLATestCase): var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9) var1_np, accum1_np = self._update_nesterov_momentum_numpy( var1_np, accum1_np, 0.9, 0.1, 0.9) - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testTensorLearningRateAndMomentum(self): for dtype in self.float_types: @@ -141,37 +147,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase): self.assertFalse(slot1 in variables.trainable_variables()) # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Step 1: the momentum accumulators where 0. So we should see a normal # update: v -= grad * learning_rate mom_update.run() # Check that the momentum accumulators have been updated. - self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval()) - self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval()) + self.assertAllCloseAccordingToType( + np.array([0.1, 0.1]), self.evaluate(slot0)) + self.assertAllCloseAccordingToType( + np.array([0.01, 0.01]), self.evaluate(slot1)) # Check that the parameters have been updated. self.assertAllCloseAccordingToType( - np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval()) + np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), + self.evaluate(var0)) self.assertAllCloseAccordingToType( - np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval()) + np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), + self.evaluate(var1)) # Step 2: the momentum accumulators contain the previous update. mom_update.run() # Check that the momentum accumulators have been updated. self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()) + np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), + self.evaluate(slot0)) self.assertAllCloseAccordingToType( - np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval()) + np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), + self.evaluate(slot1)) # Check that the parameters have been updated. self.assertAllCloseAccordingToType( np.array([ 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0) - ]), var0.eval()) + ]), self.evaluate(var0)) self.assertAllCloseAccordingToType( np.array([ - 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - ( - (0.9 * 0.01 + 0.01) * 2.0) - ]), var1.eval()) + 2.98 - ((0.9 * 0.01 + 0.01) * 2.0), + 3.98 - ((0.9 * 0.01 + 0.01) * 2.0) + ]), self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py index 86536da7fed0e2309beb32fee9c7c605491592ed..5b35c20027700b34500a31e174061d7087094b61 100644 --- a/tensorflow/compiler/tests/powersign_test.py +++ b/tensorflow/compiler/tests/powersign_test.py @@ -91,8 +91,8 @@ class PowerSignTest(xla_test.XLATestCase): variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 7 steps of powersign # first 4 steps with positive gradient @@ -125,8 +125,8 @@ class PowerSignTest(xla_test.XLATestCase): ) # Validate updated params - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) def testDense(self): decay_steps = 10 diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py index c41b4171e26af4f7ad0237d7407a5b3691299595..63cc51a470164915b2614a06d18ca1850bb64a3c 100644 --- a/tensorflow/compiler/tests/proximal_adagrad_test.py +++ b/tensorflow/compiler/tests/proximal_adagrad_test.py @@ -45,15 +45,17 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run 3 steps Proximal Adagrad. for _ in range(3): update.run() - self.assertAllClose(np.array([-2.60260963, -4.29698515]), var0.eval()) - self.assertAllClose(np.array([-0.28432083, -0.56694895]), var1.eval()) + self.assertAllClose( + np.array([-2.60260963, -4.29698515]), self.evaluate(var0)) + self.assertAllClose( + np.array([-0.28432083, -0.56694895]), self.evaluate(var1)) opt_vars = opt.variables() self.assertStartsWith(opt_vars[0].name, var0._shared_name) self.assertStartsWith(opt_vars[1].name, var1._shared_name) @@ -74,14 +76,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 3 steps Proximal Adagrad. for _ in range(3): update.run() - self.assertAllClose(np.array([-1.60261, -2.296985]), var0.eval()) - self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval()) + self.assertAllClose(np.array([-1.60261, -2.296985]), self.evaluate(var0)) + self.assertAllClose(np.array([3.715679, 2.433051]), self.evaluate(var1)) def testProximalAdagradWithL1(self): with self.cached_session(), self.test_scope(): @@ -98,14 +100,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps Proximal Adagrad for _ in range(10): update.run() - self.assertAllClose(np.array([-6.663634, -9.190331]), var0.eval()) - self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval()) + self.assertAllClose(np.array([-6.663634, -9.190331]), self.evaluate(var0)) + self.assertAllClose(np.array([2.959304, 1.029232]), self.evaluate(var1)) def testProximalAdagradWithL1_L2(self): with self.cached_session(), self.test_scope(): @@ -122,15 +124,15 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps Proximal Adagrad. for _ in range(10): update.run() - self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval()) - self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval()) + self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0)) + self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1)) def applyOptimizer(self, opt, steps=5): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) @@ -141,14 +143,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run ProximalAdagrad for a few steps for _ in range(steps): update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def testEquivAdagradwithoutRegularization(self): with self.cached_session(), self.test_scope(): diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py index 3d808e6b8a71ef9fa60b671d07bfd907e9f58efc..5aec433be765dd0a04bd7ab10d5c39a5a7f48c5c 100644 --- a/tensorflow/compiler/tests/proximal_gradient_descent_test.py +++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py @@ -42,15 +42,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([0.0, 0.0], var0.eval()) - self.assertAllClose([0.0, 0.0], var1.eval()) + self.assertAllClose([0.0, 0.0], self.evaluate(var0)) + self.assertAllClose([0.0, 0.0], self.evaluate(var1)) # Run 3 steps Proximal Gradient Descent. for _ in range(3): update.run() - self.assertAllClose(np.array([-0.9, -1.8]), var0.eval()) - self.assertAllClose(np.array([-0.09, -0.18]), var1.eval()) + self.assertAllClose(np.array([-0.9, -1.8]), self.evaluate(var0)) + self.assertAllClose(np.array([-0.09, -0.18]), self.evaluate(var1)) def testProximalGradientDescentwithoutRegularization2(self): with self.cached_session(), self.test_scope(): @@ -64,15 +64,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 3 steps Proximal Gradient Descent for _ in range(3): update.run() - self.assertAllClose(np.array([0.1, 0.2]), var0.eval()) - self.assertAllClose(np.array([3.91, 2.82]), var1.eval()) + self.assertAllClose(np.array([0.1, 0.2]), self.evaluate(var0)) + self.assertAllClose(np.array([3.91, 2.82]), self.evaluate(var1)) def testProximalGradientDescentWithL1(self): with self.cached_session(), self.test_scope(): @@ -86,15 +86,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps proximal gradient descent. for _ in range(10): update.run() - self.assertAllClose(np.array([-1.988, -3.988001]), var0.eval()) - self.assertAllClose(np.array([3.67, 2.37]), var1.eval()) + self.assertAllClose(np.array([-1.988, -3.988001]), self.evaluate(var0)) + self.assertAllClose(np.array([3.67, 2.37]), self.evaluate(var1)) def testProximalGradientDescentWithL1_L2(self): with self.cached_session(), self.test_scope(): @@ -108,15 +108,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([4.0, 3.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([4.0, 3.0], self.evaluate(var1)) # Run 10 steps Proximal Gradient Descent for _ in range(10): update.run() - self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval()) - self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval()) + self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0)) + self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1)) def applyOptimizer(self, opt, steps=5): var0 = resource_variable_ops.ResourceVariable([1.0, 2.0]) @@ -127,14 +127,14 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase): update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run ProximalAdagrad for a few steps for _ in range(steps): update.run() - return var0.eval(), var1.eval() + return self.evaluate(var0), self.evaluate(var1) def testEquivGradientDescentwithoutRegularization(self): with self.cached_session(), self.test_scope(): diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py index 236b1b881dcaffc1a5b0c6395f0605c1d7ef0269..b4d4193e35f9e0e3b23d0242ed076dd811f4ee2b 100644 --- a/tensorflow/compiler/tests/qr_op_test.py +++ b/tensorflow/compiler/tests/qr_op_test.py @@ -63,7 +63,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity. xx = math_ops.matmul(x, x, adjoint_a=True) identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) - precision = self.AdjustedNorm(xx.eval() - identity.eval()) + precision = self.AdjustedNorm(xx.eval() - self.evaluate(identity)) self.assertTrue(np.all(precision < 5.0)) def _test(self, dtype, shape, full_matrices): diff --git a/tensorflow/compiler/tests/resampler_ops_test.py b/tensorflow/compiler/tests/resampler_ops_test.py index f87ac3360c905d7956ab3716c47d42765949774d..d8ca0eab276b39f025d018edebb78eed7a8433bb 100644 --- a/tensorflow/compiler/tests/resampler_ops_test.py +++ b/tensorflow/compiler/tests/resampler_ops_test.py @@ -63,8 +63,8 @@ class ResamplerOpsTest(xla_test.XLATestCase): def testSimple(self): for dtype in self.float_types: input_shape = [1, 2, 2, 1] - input_rgb_data = [0, 5, 13, 54] - input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape) + input_data = [0, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) warp_shape = [1, 2] warp_data = [0.7, 0.6] @@ -151,6 +151,55 @@ class ResamplerOpsTest(xla_test.XLATestCase): expected_grad_data, expected_grad_warp) + def testOutOfBoundWarps(self): + # (x, y) are both less than 0. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [-1, -1, 0.7, 0.6] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [27.62]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + + # One of (x, y) is less than 0. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [-1, 0.1, 0.7, 0.6] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [27.62]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + + # Both of (x, y) are greater than image size. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [-0.1, 0.1, 1.2, 2.1] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [0.0]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + + # One of (x, y) is greater than image size. + for dtype in self.float_types: + input_shape = [1, 2, 2, 1] + input_data = [10, 5, 13, 54] + input_np = np.array(input_data, dtype=dtype).reshape(input_shape) + + warp_shape = [1, 2, 2] + warp_data = [0.1, -0.1, 1.2, 0.1] + warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape) + expected = [[[0.0], [0.0]]] + self._assertForwardOpMatchesExpected(input_np, warp_np, expected) + if __name__ == '__main__': test.main() diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py index 8840a1329a907bddc6ef1cb6dd1c2a6d234def5c..dc3e90b4afa41c08d899ee195d42fb91678bad1c 100644 --- a/tensorflow/compiler/tests/rmsprop_test.py +++ b/tensorflow/compiler/tests/rmsprop_test.py @@ -76,7 +76,7 @@ class RmspropTest(xla_test.XLATestCase): rms_opt = rmsprop.RMSPropOptimizer(learning_rate, centered=centered) rms_update = rms_opt.apply_gradients( zip([grads0, grads1], [var0, var1])) - variables.global_variables_initializer().run() + self.evaluate(variables.global_variables_initializer()) mg0 = rms_opt.get_slot(var0, "mg") self.assertEqual(mg0 is not None, centered) @@ -92,12 +92,12 @@ class RmspropTest(xla_test.XLATestCase): self.assertTrue(mom1 is not None) # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 2.0], self.evaluate(var0)) + self.assertAllClose([3.0, 4.0], self.evaluate(var1)) # Run 3 steps of RMSProp for _ in range(3): - rms_update.run() + self.evaluate(rms_update) var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy( var0_np, @@ -118,14 +118,14 @@ class RmspropTest(xla_test.XLATestCase): # Validate updated params if centered: - self.assertAllCloseAccordingToType(mg0_np, mg0.eval()) - self.assertAllCloseAccordingToType(mg1_np, mg1.eval()) - self.assertAllCloseAccordingToType(rms0_np, rms0.eval()) - self.assertAllCloseAccordingToType(rms1_np, rms1.eval()) - self.assertAllCloseAccordingToType(mom0_np, mom0.eval()) - self.assertAllCloseAccordingToType(mom1_np, mom1.eval()) - self.assertAllCloseAccordingToType(var0_np, var0.eval()) - self.assertAllCloseAccordingToType(var1_np, var1.eval()) + self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0)) + self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1)) + self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0)) + self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1)) + self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0)) + self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1)) + self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0)) + self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py index 897db384b7e8067b0460b5f344201f101a4d8479..17639bd8a755b9e9f5acc77979ac7a4149f112db 100644 --- a/tensorflow/compiler/tests/scan_ops_test.py +++ b/tensorflow/compiler/tests/scan_ops_test.py @@ -71,7 +71,7 @@ def handle_options(func, x, axis, exclusive, reverse): class CumsumTest(xla_test.XLATestCase): - valid_dtypes = [np.float32] + valid_dtypes = [np.float32, np.int32] def axis_dtypes(self): return set(self.int_types).intersection([np.int32, np.int64]) @@ -149,7 +149,7 @@ class CumsumTest(xla_test.XLATestCase): class CumprodTest(xla_test.XLATestCase): - valid_dtypes = [np.float32] + valid_dtypes = [np.float32, np.int32] def axis_dtypes(self): return set(self.int_types).intersection([np.int32, np.int64]) diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index 46ca371c8abf1cb4710717a183ee12820c4c4ca0..c8208adb587431353ab451d796a0a42480cbe196 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -79,7 +79,8 @@ class TensorArrayTest(xla_test.XLATestCase): c0 = w2.stack() self.assertAllEqual( - convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval()) + convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), + self.evaluate(c0)) def testTensorArrayWritePack(self): for dtype in self.numeric_tf_types: @@ -97,7 +98,7 @@ class TensorArrayTest(xla_test.XLATestCase): c0 = w2.stack() - self.assertAllEqual([3, 0, 1], c0.eval().shape) + self.assertAllEqual([3, 0, 1], self.evaluate(c0).shape) def _testTensorArrayWriteConcat(self, tf_dtype): with self.cached_session(), self.test_scope(): @@ -113,8 +114,8 @@ class TensorArrayTest(xla_test.XLATestCase): c0 = w2.concat() self.assertAllEqual( - convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], - [106.0, 107.0], [8.0, 9.0], [204.0, 205.0]]), c0.eval()) + convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], [106.0, 107.0], + [8.0, 9.0], [204.0, 205.0]]), self.evaluate(c0)) def testTensorArrayWriteConcat(self): for dtype in self.numeric_tf_types: @@ -341,7 +342,7 @@ class TensorArrayTest(xla_test.XLATestCase): r0_bad = gen_data_flow_ops.tensor_array_read_v3( handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow) with self.assertRaisesOpError("TensorArray dtype is "): - r0_bad.eval() + self.evaluate(r0_bad) # Test reading from a different index than the one we wrote to w0.read(1) @@ -422,7 +423,7 @@ class TensorArrayTest(xla_test.XLATestCase): w2 = h2.write(0, 5.0) r2 = w2.read(0) r = r1 + r2 - self.assertAllClose(9.0, r.eval()) + self.assertAllClose(9.0, self.evaluate(r)) def _testTensorArrayGradientWriteReadType(self, dtype): with self.cached_session() as session, self.test_scope(): @@ -526,7 +527,7 @@ class TensorArrayTest(xla_test.XLATestCase): with ops.control_dependencies([r0_readtwice]): r1_readtwice = w_readtwice.read(0) - self.assertAllEqual([1.0, -1.0], r1_readtwice.eval()) + self.assertAllEqual([1.0, -1.0], self.evaluate(r1_readtwice)) def _testTensorArrayGradientUnpackRead(self): with self.cached_session() as session, self.test_scope(): @@ -592,7 +593,7 @@ class TensorArrayTest(xla_test.XLATestCase): ta = tensor_array_ops.TensorArray( dtype=dtypes.float32, tensor_array_name="foo", size=3) s = ta.size() - self.assertAllEqual(3, s.eval()) + self.assertAllEqual(3, self.evaluate(s)) def testWriteCloseTensorArray(self): with self.cached_session(), self.test_scope(): @@ -722,7 +723,7 @@ class TensorArrayTest(xla_test.XLATestCase): # r = acc2.stack() # grad = gradients_impl.gradients(r, [x])[0] - # self.assertAllClose(31.0, grad.eval()) + # self.assertAllClose(31.0, self.evaluate(grad)) def testSumOfTwoReadVariablesWithoutRepeatGrad(self): with self.cached_session() as session, self.test_scope(): @@ -912,7 +913,7 @@ class TensorArrayTest(xla_test.XLATestCase): self.assertEqual(0, ta.size().eval()) ta = ta.unstack(array_ops.zeros([0, 3, 5])) packed = ta.stack() - self.assertAllEqual([0, 3, 5], packed.eval().shape) + self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape) # Concatenating zero tensors along their first dimension gives a # first dimension of zero self.assertAllEqual([0, 5], ta.concat().eval().shape) @@ -1041,8 +1042,8 @@ class TensorArrayTest(xla_test.XLATestCase): (read0, read1, size0, size1)) # Tests that the control dependencies was added and executed. - self.assertEqual(1, v0.eval()) - self.assertEqual(1, v1.eval()) + self.assertEqual(1, self.evaluate(v0)) + self.assertEqual(1, self.evaluate(v1)) # Tests correct TensorArray. self.assertEqual(read0_v, 0) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 486b4d8a8c35097c0ad333b6fd87d34e5bf5b1e4..3458c7f1c40cd70187e209eb40db24245d595d04 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -211,7 +211,6 @@ cc_library( "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/lib:constants", - "//tensorflow/compiler/xla/client/lib:numeric", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc index 34f891890a4d40bf8db90cf2618f323b2108b56a..1de85004a51bea464f8f0166511402e5dd85ac14 100644 --- a/tensorflow/compiler/tf2xla/dump_graph.cc +++ b/tensorflow/compiler/tf2xla/dump_graph.cc @@ -61,8 +61,7 @@ string MakeUniqueFilename(string name) { string WriteTextProtoToUniqueFile( Env* env, const string& name, const char* proto_type, const ::tensorflow::protobuf::Message& proto) { - const string& dirname = - legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix; + const string& dirname = GetDumpGraphFlags()->tf_dump_graph_prefix; Status status = env->RecursivelyCreateDir(dirname); if (!status.ok()) { LOG(WARNING) << "Failed to create " << dirname << " for dumping " diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 9ef9f49f422ec4dfaf538ac3c0754ba3609d3f88..3dfd3f854c8646ebbf06d3378201d22e8741b7eb 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -75,6 +75,25 @@ Status FunctionalizeControlFlow(Graph* graph, return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library); } +Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, + FunctionLibraryDefinition* library) { + return FunctionalizeControlFlowForGraphDef(/*lookup_library=*/nullptr, + graph_def, library); +} + +Status FunctionalizeControlFlowForGraphDef( + const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def, + FunctionLibraryDefinition* library) { + FunctionDefLibrary function_lib = graph_def->library(); + Graph graph(OpRegistry::Global()); + + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph)); + TF_RETURN_IF_ERROR(FunctionalizeControlFlow(lookup_library, &graph, library)); + graph.ToGraphDef(graph_def); + std::swap(*graph_def->mutable_library(), function_lib); + return Status::OK(); +} + Status FunctionalizeControlFlowForFunction( const string& func_name, const string& new_func_name, const protobuf::Map& attrs, diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h index ba99205640ccdc83a3a4d50e3ec474907894a835..91d33fa405834d7f1f8f66180583580f4f2e448a 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -33,6 +33,12 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, Graph* graph, FunctionLibraryDefinition* library); +Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, + FunctionLibraryDefinition* library); +Status FunctionalizeControlFlowForGraphDef( + const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def, + FunctionLibraryDefinition* library); + // This pass looks at the graph and all associated FunctionDefs, and turns // traditional control flow structure (Switch/Merge/etc.) into functional // control flow structure (If/While). diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index c3841f996f801e855da75b23f01d41674ec51c4d..9784985af83a18619d837528f99a60b98a501ec5 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -95,77 +95,87 @@ TEST(FunctionalizeControlFlow, Conditional) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + string op_name; + NameAttrList then_fn; + NameAttrList else_fn; + TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); + InstantiationResultForTest else_result; + TF_EXPECT_OK( + InstantiateFunctionForTest(else_fn.name(), library, &else_result)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); + auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); + auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); + auto if_op = ops::If(scope.WithOpName(op_name), less, + std::initializer_list{less, y, x}, {DT_INT32}, + then_fn, else_fn); + auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - string op_name; - NameAttrList then_fn; - NameAttrList else_fn; - TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn)); - InstantiationResultForTest else_result; - TF_EXPECT_OK( - InstantiateFunctionForTest(else_fn.name(), library, &else_result)); - - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32); - auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto less = ops::Less(scope.WithOpName("cond/Less"), y, x); - auto if_op = ops::If(scope.WithOpName(op_name), less, - std::initializer_list{less, y, x}, {DT_INT32}, - then_fn, else_fn); - auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // then body. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); - auto cond = ops::Const( - scope.WithOpName("cond").WithControlDependencies(identity), 17); - auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + // then body. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0); + auto cond = ops::Const( + scope.WithOpName("cond").WithControlDependencies(identity), 17); + auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond); + auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(then_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), + result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - // else body. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); - auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); - auto cond_1 = ops::Const( - scope.WithOpName("cond_1").WithControlDependencies(identity), 23); - auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // else body. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0); + auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0); + auto cond_1 = ops::Const( + scope.WithOpName("cond_1").WithControlDependencies(identity), 23); + auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1); + auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(else_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), + result.arg_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } @@ -239,75 +249,77 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - - NameAttrList cond_fn, body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{source}, cond_fn, body_fn); - auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Condition graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto ten = ops::Const( - scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); - auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // Body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - auto one = ops::Const( - scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); - auto add = ops::Add(scope.WithOpName("while/add"), identity, one); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); + // Condition graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto ten = ops::Const( + scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); + auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(cond_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + auto one = ops::Const( + scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); + auto add = ops::Add(scope.WithOpName("while/add"), identity, one); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } -// @function.Defun(noinline=True) -// def increment_fn(x): -// return [x + 1] -// Define the above function, and add it to the given graph. It's used as the -// while loop body in NoinlineLoopBody test. -Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { +FunctionDef GetNoinlineFunctionDef() { FunctionDef fdef = FunctionDefHelper::Create( "increment_fn", {"x:int32"}, {"add:int32"}, {}, { @@ -316,8 +328,17 @@ Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { }, {{"add", "add_0:z:0"}}); (*fdef.mutable_attr())["_noinline"].set_b(true); + return fdef; +} + +// @function.Defun(noinline=True) +// def increment_fn(x): +// return [x + 1] +// Define the above function, and add it to the given graph. It's used as the +// while loop body in NoinlineLoopBody test. +Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { FunctionDefLibrary fdef_lib; - *(fdef_lib.add_function()) = fdef; + *(fdef_lib.add_function()) = GetNoinlineFunctionDef(); TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib)); NodeDef increment_fn; increment_fn.set_name(node_name); @@ -376,55 +397,88 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) { FunctionLibraryDefinition lookup_lib(graph.flib_def()); FunctionLibraryDefinition library(OpRegistry::Global(), {}); // Function increment_fn will be copied from lookup_lib to library. - TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library)); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); + *(optimized_graph_def.mutable_library()->add_function()) = + GetNoinlineFunctionDef(); - NameAttrList cond_fn, body_fn; - TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef( + &lookup_lib, &optimized_graph_def, &library)); + TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{source}, cond_fn, body_fn); - GraphDef expected; - TF_ASSERT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + TF_ASSERT_OK( + AddNoinlineFunctionToGraph(noinline_node_name, scope.graph())); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + NodeDef retval; + retval.set_name("_retval0_RetVal"); + retval.set_op(FunctionLibraryDefinition::kRetOp); + *retval.add_input() = noinline_node_name; + (*retval.mutable_attr())["T"].set_type(DT_INT32); + (*retval.mutable_attr())["index"].set_i(0); + Status status; + scope.graph()->AddNode(retval, &status); + TF_ASSERT_OK(status); + + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + // Verify that increment_fn has been copied to library. + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + // Ignore the function library when comparing the graphs. + expected.clear_library(); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } +} - // Body graph. +TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) { + const string& noinline_node_name = "while/increment_fn"; + Graph graph(OpRegistry::Global()); { Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), source); TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph())); - auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - NodeDef retval; - retval.set_name("_retval0_RetVal"); - retval.set_op(FunctionLibraryDefinition::kRetOp); - *retval.add_input() = noinline_node_name; - (*retval.mutable_attr())["T"].set_type(DT_INT32); - (*retval.mutable_attr())["index"].set_i(0); - Status status; - scope.graph()->AddNode(retval, &status); - TF_ASSERT_OK(status); - - GraphDef expected; - TF_ASSERT_OK(scope.ToGraphDef(&expected)); + TF_ASSERT_OK(scope.ToGraph(&graph)); + } - InstantiationResultForTest result; - // Verify that increment_fn has been copied to library. - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); + FunctionLibraryDefinition lookup_lib(graph.flib_def()); + FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef graph_def; + graph.ToGraphDef(&graph_def); + graph_def.clear_library(); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - // Ignore the function library when comparing the graphs. - expected.clear_library(); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } + Status status = + FunctionalizeControlFlowForGraphDef(&lookup_lib, &graph_def, &library); + EXPECT_EQ(tensorflow::error::NOT_FOUND, status.code()); } // Tests functionalizing OneLoopVar where the loop value is not used post the @@ -467,65 +521,72 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - - NameAttrList cond_fn, body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - - // Outer graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{source}, cond_fn, body_fn); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Condition graph - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto ten = ops::Const( - scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); - auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result)); - - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // Body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); - auto one = ops::Const( - scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); - auto add = ops::Add(scope.WithOpName("while/add"), identity, one); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); + // Condition graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto ten = ops::Const( + scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10); + auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(cond_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); - EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + auto one = ops::Const( + scope.WithOpName("while/add/y").WithControlDependencies(identity), 1); + auto add = ops::Add(scope.WithOpName("while/add"), identity, one); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } @@ -608,86 +669,95 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); + + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList cond_fn, body_fn; + TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + + // Outer graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32); + auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32); + auto while_op = + ops::While(scope.WithOpName("while/LoopCond"), + std::initializer_list{x, y}, cond_fn, body_fn); + auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]); + auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - - NameAttrList cond_fn, body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); - - // Outer graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32); - auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32); - auto while_op = - ops::While(scope.WithOpName("while/LoopCond"), - std::initializer_list{x, y}, cond_fn, body_fn); - auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]); - auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Condition graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto three = ops::Const(scope.WithOpName("while/cond/three") + // Condition graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto three = ops::Const(scope.WithOpName("while/cond/three") + .WithControlDependencies(arg0.output), + 3); + auto cond_add = + ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three); + auto ten = ops::Const(scope.WithOpName("while/cond/ten") .WithControlDependencies(arg0.output), - 3); - auto cond_add = - ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three); - auto ten = ops::Const( - scope.WithOpName("while/cond/ten").WithControlDependencies(arg0.output), - 10); - auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result)); - - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // Body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - - auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"), arg0); - auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1); - - auto one = ops::Const( - scope.WithOpName("while/add/one").WithControlDependencies(identity_x), - 1); - auto two = ops::Const( - scope.WithOpName("while/mul/two").WithControlDependencies(identity_x), - 2); + 10); + auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one); - auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two); - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); - auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(cond_fn.name(), library, &result)); - InstantiationResultForTest result; - TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + + auto identity_x = + ops::Identity(scope.WithOpName("while/Identity/x"), arg0); + auto identity_y = + ops::Identity(scope.WithOpName("while/Identity/y"), arg1); + + auto one = ops::Const( + scope.WithOpName("while/add/one").WithControlDependencies(identity_x), + 1); + auto two = ops::Const( + scope.WithOpName("while/mul/two").WithControlDependencies(identity_x), + 2); + + auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one); + auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two); + auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0); + auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(body_fn.name(), library, &result)); + + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } @@ -841,177 +911,192 @@ TEST(FunctionalizeControlFlow, Complex) { } FunctionLibraryDefinition library(OpRegistry::Global(), {}); + GraphDef optimized_graph_def; + graph.ToGraphDef(&optimized_graph_def); + TF_ASSERT_OK( + FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library)); TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library)); + GraphDef converted_graph_def; + graph.ToGraphDef(&converted_graph_def); - GraphDef graph_def; - graph.ToGraphDef(&graph_def); - - NameAttrList outer_cond_fn, outer_body_fn; - TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn)); - - // Outer graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); - auto three = ops::Const(scope.WithOpName("three"), 3); - auto y = ops::Add(scope.WithOpName("y"), x, three); - - auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32, - TensorShape({})); - - auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); - - auto while_op = ops::While(scope.WithOpName("outer/LoopCond"), - std::initializer_list{zero, y, x, var}, - outer_cond_fn, outer_body_fn); - auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - TF_EXPECT_GRAPH_EQ(expected, graph_def); - } - - // Outer condition graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - - auto ten = ops::Const( - scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output), - 10); - auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(outer_cond_fn.name(), library, &result)); - - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // Outer body graph. - NameAttrList inner_cond_fn, inner_body_fn; - { - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(outer_body_fn.name(), library, &result)); - - // Find the inner condition and body names. - TF_EXPECT_OK( - FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn)); - - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - - auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0); - auto one_j = ops::Const( - scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1); - auto while_op = - ops::While(scope.WithOpName("outer/LoopCond_1"), - std::initializer_list{one_j, arg1, arg2, arg3}, - inner_cond_fn, inner_body_fn); - - auto one_outer = ops::Const( - scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1); - auto add_i = - ops::Add(scope.WithOpName("outer/add") - .WithControlDependencies(absl::Span{ - while_op[0].op(), while_op[1].op()}), - identity_i, one_outer); - - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0); - auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1); - auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // Inner condition graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - - auto five = ops::Const( - scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5); - auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five); - auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0); - - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); - - InstantiationResultForTest result; + for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) { + NameAttrList outer_cond_fn, outer_body_fn; TF_EXPECT_OK( - InstantiateFunctionForTest(inner_cond_fn.name(), library, &result)); - - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); - } - - // Inner body graph. - { - Scope scope = Scope::NewRootScope().ExitOnError(); - auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); - auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); - auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); - auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); - - auto identity_j = - ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0); - auto identity_k = - ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1); - - auto mul_jk = - ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k); - auto add_jkx = ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2); - auto assign = ops::AssignAddVariableOp( - scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx); - - auto one = ops::Const( - scope.WithOpName("outer/inner/One") - .WithControlDependencies( - absl::Span{assign.operation}), - 1); - auto add_j = - ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one); + FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn)); + + // Outer graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32); + auto three = ops::Const(scope.WithOpName("three"), 3); + auto y = ops::Add(scope.WithOpName("y"), x, three); + + auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32, + TensorShape({})); + + auto zero = ops::Const(scope.WithOpName("outer/Const"), 0); + + auto while_op = ops::While(scope.WithOpName("outer/LoopCond"), + std::initializer_list{zero, y, x, var}, + outer_cond_fn, outer_body_fn); + auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]); + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } - auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0); - auto retval1 = - ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1); - auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); + // Outer condition graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + + auto ten = ops::Const( + scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output), + 10); + auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten); + auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(outer_cond_fn.name(), library, &result)); + + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - GraphDef expected; - TF_EXPECT_OK(scope.ToGraphDef(&expected)); + // Outer body graph. + NameAttrList inner_cond_fn, inner_body_fn; + { + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(outer_body_fn.name(), library, &result)); + + // Find the inner condition and body names. + TF_EXPECT_OK( + FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn)); + + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + + auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0); + auto one_j = ops::Const( + scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1); + auto while_op = + ops::While(scope.WithOpName("outer/LoopCond_1"), + std::initializer_list{one_j, arg1, arg2, arg3}, + inner_cond_fn, inner_body_fn); + + auto one_outer = ops::Const( + scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), + 1); + auto add_i = + ops::Add(scope.WithOpName("outer/add") + .WithControlDependencies(absl::Span{ + while_op[0].op(), while_op[1].op()}), + identity_i, one_outer); + + auto retval0 = + ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0); + auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1); + auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), + result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - InstantiationResultForTest result; - TF_EXPECT_OK( - InstantiateFunctionForTest(inner_body_fn.name(), library, &result)); + // Inner condition graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + + auto five = ops::Const( + scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), + 5); + auto less_j = + ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five); + auto retval = + ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(inner_cond_fn.name(), library, &result)); + + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), - result.arg_types); - EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types); - TF_EXPECT_GRAPH_EQ(expected, result.gdef); + // Inner body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1); + auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2); + auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3); + + auto identity_j = + ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0); + auto identity_k = + ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1); + + auto mul_jk = + ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k); + auto add_jkx = + ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2); + auto assign = ops::AssignAddVariableOp( + scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx); + + auto one = ops::Const( + scope.WithOpName("outer/inner/One") + .WithControlDependencies( + absl::Span{assign.operation}), + 1); + auto add_j = + ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one); + + auto retval0 = + ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0); + auto retval1 = + ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1); + auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2); + + GraphDef expected; + TF_EXPECT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + TF_EXPECT_OK( + InstantiateFunctionForTest(inner_body_fn.name(), library, &result)); + + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}), + result.arg_types); + EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), + result.ret_types); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } } } diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index ad85940920ebb82e72331516e3fe46c79f853892..3e398fff951a211f5af42d26983bc7473bddde63 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -21,10 +21,13 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" namespace tensorflow { namespace { @@ -57,8 +60,6 @@ class CategoricalOp : public XlaOpKernel { const int64 batch_size = logits_shape.dim_size(0); const int64 num_classes = logits_shape.dim_size(1); - xla::XlaBuilder* builder = ctx->builder(); - xla::Shape uniform_shape; int class_dimension; if (num_samples > 1) { @@ -83,16 +84,16 @@ class CategoricalOp : public XlaOpKernel { xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array); class_dimension = 1; } - xla::XlaOp uniforms = - xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)), - XlaHelpers::One(builder, input_type(0)), uniform_shape); + xla::PrimitiveType type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(0), &type)); + xla::XlaOp log_uniforms = GetLogUniforms(uniform_shape, type, ctx); // Use Gumbel softmax trick to generate categorical samples. // See: // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/ // TODO(b/68769470): Switch to using a cumulative sum approach. auto softmax_entries = - xla::Sub(logits, xla::Log(-xla::Log(uniforms)), + xla::Sub(logits, log_uniforms, /*broadcast_dimensions=*/{0, class_dimension}); xla::PrimitiveType xla_output_type; @@ -107,6 +108,16 @@ class CategoricalOp : public XlaOpKernel { ctx->SetOutput(0, argmax); } + virtual xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, + xla::PrimitiveType type, + XlaOpKernelContext* ctx) { + xla::XlaBuilder* builder = ctx->builder(); + auto uniforms = + xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)), + XlaHelpers::One(builder, input_type(0)), uniform_shape); + return xla::Log(-xla::Log(uniforms)); + } + private: TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp); }; @@ -115,5 +126,48 @@ class CategoricalOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstantInput("num_samples"), CategoricalOp); +class StatelessCategoricalOp : public CategoricalOp { + public: + explicit StatelessCategoricalOp(OpKernelConstruction* ctx) + : CategoricalOp(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_)); + } + + xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, xla::PrimitiveType type, + XlaOpKernelContext* ctx) override { + xla::XlaOp seed = ctx->Input(2); + auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + + xla::XlaBuilder* builder = ctx->builder(); + if (uniform_shape.element_type() == xla::BF16) { + uniform_shape.set_element_type(xla::F32); + } + auto uniforms = xla::StatelessRngUniform( + {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT), + XlaHelpers::One(builder, DT_FLOAT)); + return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape seed_shape = ctx->InputShape(2); + OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2, + errors::InvalidArgument("seed must have shape [2], not ", + seed_shape.DebugString())); + CategoricalOp::Compile(ctx); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp); +}; + +REGISTER_XLA_OP(Name("StatelessMultinomial") + .CompileTimeConstantInput("num_samples") + .TypeConstraint("T", {DT_FLOAT, DT_BFLOAT16}) + .TypeConstraint("Tseed", DT_INT32), + StatelessCategoricalOp); + } // anonymous namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index c9a1be494066e4f935a1d818bc86c86333e34fae..b1046fcc0001a3eb450c82a8545e2cfdf4e43fd0 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/node_def_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index c68b0bfd7961892294c2931e5c4c44de534a7740..29687c7b82f92d9f336854c4575746589c63b64f 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -17,7 +17,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/util/tensor_format.h" diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc index e310db2162da0997204f85bc3ca42e7b0460e1e3..42bf4b06e5da7c6f99ad32ae36131dffd124d103 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc @@ -119,6 +119,10 @@ class ArgMaxCustomCallOp : public XlaOpKernel { ", but got shape: ", input_shape.DebugString())); } + const DataType dtype = output_type(0); + xla::PrimitiveType output_type; + OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype, &output_type)); + output = xla::ConvertElementType(output, output_type); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc index 8dfd7de591c4a3c4768dd60b41e03d294ad49397..a99b74565dab4587bee999e3d73340ff58d21f77 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor_shape.h" diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc index c0ca881ff82cee04e0c5e35f9a2d5732fabdd8a6..4f980b6d14ed667bdf4756ed740894098cae5919 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" namespace tensorflow { diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 415ce9b77ffeac8a6a5f3c23537afb16c1d3567c..8822e29f7e77b1cbc6fa6ca61d0062d9b1b0c36e 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index 107fa62967a55dffcfff8728b65338564e5202d2..132160de707911f26389034e16236985bb18e6ad 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -113,11 +113,21 @@ class MeanOp : public XlaReductionOp { xla::Add(scalar_lhs, scalar_rhs); } - xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, - const xla::XlaOp& reduce_output, - int64 num_elements_reduced) override { - auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0), - num_elements_reduced); + xla::XlaOp BuildFinalizer( + xla::XlaBuilder* /*builder*/, const xla::XlaOp& input, + const xla::XlaOp& reduce_output, + const std::vector& dimensions_to_reduce) override { + if (dimensions_to_reduce.empty()) { + return reduce_output; + } + auto divisor = xla::GetDimensionSize(input, dimensions_to_reduce[0]); + for (int i = 1; i < dimensions_to_reduce.size(); i++) { + auto size = xla::GetDimensionSize(input, dimensions_to_reduce[i]); + divisor = xla::Mul(divisor, size); + } + xla::PrimitiveType type; + TF_CHECK_OK(DataTypeToPrimitiveType(input_type(0), &type)); + divisor = xla::ConvertElementType(divisor, type); return reduce_output / divisor; } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h index 466e79828d111ee7cadcf713703e8f252c63e62c..8f1667df5b72e9ecf97e5771670ef209dee287a3 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -48,13 +48,14 @@ class XlaReductionOp : public XlaOpKernel { const xla::XlaOp& scalar_rhs) = 0; // Applies a transformation to the output of the reduction. The desired - // computation should be added to 'builder'. Argument 'reduce_output' is the - // output of the reduction. 'num_elements_reduced' is the number of elements - // that contributed to the reduction. Returns the transformed reduction - // output, Defaults to returning 'reduce_output' unchanged. - virtual xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, - const xla::XlaOp& reduce_output, - int64 num_elements_reduced); + // computation should be added to 'builder'. Argument 'input' is the original + // input of the reduction; 'reduce_output' is the output of the reduction. + // Returns the transformed reduction output, Defaults to returning + // 'reduce_output' unchanged. + virtual xla::XlaOp BuildFinalizer( + xla::XlaBuilder* builder, const xla::XlaOp& input, + const xla::XlaOp& reduce_output, + const std::vector& dimensions_to_reduce); void Compile(XlaOpKernelContext* ctx) override; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 118f2798d559f43acb7f6394a7337426164325ef..e96cabbb853be744dbba7f19fbbd227bb52ebb06 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -37,9 +37,10 @@ XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx, // Unless BuildFinalizer is overridden the reduction has no // finalizer. -xla::XlaOp XlaReductionOp::BuildFinalizer(xla::XlaBuilder* builder, - const xla::XlaOp& reduce_output, - int64 num_elements_reduced) { +xla::XlaOp XlaReductionOp::BuildFinalizer( + xla::XlaBuilder* /*builder*/, const xla::XlaOp& /*input*/, + const xla::XlaOp& reduce_output, + const std::vector& /*dimensions_to_reduce*/) { return reduce_output; } @@ -71,7 +72,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { absl::InlinedVector bitmap(data_shape.dims(), false); std::vector xla_axes; - int64 num_elements_reduced = 1LL; for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) { int64 index = axes[i]; OP_REQUIRES(ctx, @@ -82,7 +82,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { index = (index + data_shape.dims()) % data_shape.dims(); bitmap[index] = true; xla_axes.push_back(index); - num_elements_reduced *= data_shape.dim_size(index); } std::vector final_shape; @@ -119,7 +118,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes); auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0)); - auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced); + auto finalized = BuildFinalizer(b, data, deconverted, xla_axes); auto result = keep_dims_ ? xla::Reshape(finalized, final_shape) : finalized; ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc index 847704608fb32b43ffb61f87556d5231b9e39cde..8a8f33c8f39e47d7bd1f59413be880c51d273cf1 100644 --- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc @@ -44,9 +44,6 @@ namespace { using xla::XlaOp; -// TODO(b/112295522): note that sampling from image boundary is not currently -// being handled properly. - // Calculates the bilinear weight tensor, given basis ratio (px, py) of the // sampling position: // W = [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py] @@ -421,12 +418,13 @@ class ResamplerOp : public XlaOpKernel { OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2, errors::InvalidArgument( "the last dimension of warp must be exactly size 2.")); + xla::PrimitiveType warp_type = ctx->input_xla_type(1); XlaOp data = ctx->Input("data"); XlaOp warp = ctx->Input("warp"); // Find the coordinates of the top left corner for the 2x2 region to be - // sampled from. The dimensions are (batch, dim_0, ... dim_n, 2) where the + // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the // last dimension of size 2 in turn is [x, y]. XlaOp top_left = xla::ConvertElementType(warp, xla::U32); @@ -457,10 +455,56 @@ class ResamplerOp : public XlaOpKernel { dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1); dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1); + // The dimension is [batch, dim_0, ...dim_n, data_channels]. auto blended_pixels = xla::DotGeneral(weights, neighbors_data, dot_dims, /*precision_config=*/nullptr); - ctx->SetOutput(0, blended_pixels); + // Handle out of boundary cases by constructing a predicate mask array based + // on the in-bound condition, and output 0 for the blended pixel value if + // out-bound. The dimension is the same as top_left: [batch, dim_0, + // ...dim_n, 2] where the last dimension of size 2 is the [x, y] coordinate. + + auto is_ge_zero = xla::Ge(warp, xla::ZerosLike(warp)); + + auto is_lt_image_size = xla::Lt( + warp, + xla::ConvertElementType( + xla::ConstantR1( + ctx->builder(), + {/*width=*/static_cast(data_shape.dim_size(2) - 1), + /*height=*/static_cast(data_shape.dim_size(1) - 1)}), + warp_type), + /*broadcast_dimensions=*/{warp_shape.dims() - 1}); + + auto is_in_bound_x_y = xla::And(is_ge_zero, is_lt_image_size); + // Reduce along last dimension. The resulting dimension is: + // [batch, dim_0, ...dim_n]. + auto is_in_bound = xla::Reduce( + is_in_bound_x_y, xla::ConstantR0(ctx->builder(), true), + xla::CreateScalarAndComputation(xla::PrimitiveType::PRED, + ctx->builder()), + {last_warp_dim}); + + // Broadcast 'is_in_bound' to the same dimension as 'blended_pixels', which + // is the dimension of the result: + // [batch, dim_0, ...dim_n, data_channels]. + auto warp_dims = warp_shape.dim_sizes(); + std::vector result_dims(warp_dims.begin(), warp_dims.end() - 1); + result_dims.push_back(data_channels); + xla::Shape broadcasted_shape = + xla::ShapeUtil::MakeShape(xla::PrimitiveType::PRED, result_dims); + + std::vector broadcasted_dims(warp_dims.size() - 1); + std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0); + auto broadcasted_is_in_bound = + xla::BroadcastInDim(is_in_bound, broadcasted_shape, broadcasted_dims); + + // Set out of bound samples to zero. + auto zeros = + xla::Broadcast(xla::Zero(ctx->builder(), data_type), result_dims); + auto result = xla::Select(broadcasted_is_in_bound, blended_pixels, zeros); + + ctx->SetOutput(0, result); } }; @@ -473,6 +517,8 @@ class ResamplerGradOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype)); } + // TODO(b/112295522): note that sampling from image boundary is not currently + // being handled properly. void Compile(XlaOpKernelContext* ctx) override { TensorShape data_shape_tf = ctx->InputShape("data"); OP_REQUIRES(ctx, data_shape_tf.dims() == 4, diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc index 7ff3e9163811434e8d621795c22bf8304ba7a1ed..d7b38e86cc985d608116488f9e76756a8e904f9c 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -18,7 +18,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index b5fd7850bfca01868273c40cbf86188bd815be5b..7f4fef146f6169cf6a578e7dc9cbb87b536e4cb8 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -39,8 +39,8 @@ namespace { // TODO(phawkins): implement double-sized windowed reductions in XLA and remove // the type constraint. -constexpr std::array kScanOpTypes = { - {DT_HALF, DT_BFLOAT16, DT_FLOAT}}; +constexpr std::array kScanOpTypes = { + {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_INT32}}; class ScanOp : public XlaOpKernel { public: diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc index 60b011ba6d9b64a89e4228ba2a213f72b67a462d..b1fa2915d59e4e5e2f2523e20e9a37898d087117 100644 --- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 66206909a92fddbac4e77e5d2d8164fcbb46f317..a1d359e97c4fad3ca74d44a358cba0e8190cdc22 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -26,7 +26,7 @@ limitations under the License. // Forward-declare, rather than include, to reduce code size for users that // never use this functionality. namespace xla { -class ProgramShape; +class ProgramShapeProto; class HloProfilePrinterData; } @@ -84,7 +84,7 @@ class XlaCompiledCpuFunction { void set_result_names(const char** result_names) { result_names_ = result_names; } - void set_program_shape(const xla::ProgramShape* program_shape) { + void set_program_shape(const xla::ProgramShapeProto* program_shape) { program_shape_ = program_shape; } const xla::HloProfilePrinterData* hlo_profile_printer_data() const { @@ -122,7 +122,7 @@ class XlaCompiledCpuFunction { const char** result_names_ = nullptr; // [Optional] Arg and result shapes. - const xla::ProgramShape* program_shape_ = nullptr; + const xla::ProgramShapeProto* program_shape_ = nullptr; // [Optional] Profile printer data. Null if profiling is disabled. const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; @@ -264,7 +264,7 @@ class XlaCompiledCpuFunction { // Returns the shape of the args and results. May return nullptr if the // program shape isn't available. - const xla::ProgramShape* ProgramShape() const { return program_shape_; } + const xla::ProgramShapeProto* ProgramShape() const { return program_shape_; } bool hlo_profiling_enabled() const { return hlo_profile_printer_data_ != nullptr; @@ -305,7 +305,7 @@ class XlaCompiledCpuFunction { // Optional metadata. const char** arg_names_ = nullptr; const char** result_names_ = nullptr; - const xla::ProgramShape* program_shape_ = nullptr; + const xla::ProgramShapeProto* program_shape_ = nullptr; const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; }; diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index 9a34cd8c6ae2dc6d52a3cc69168df96f5322c6da..af378bc95c096082ff5cd963b9d6156f4351cd8d 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -26,7 +26,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/types.h" diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 86a78ee429e8913edb4a948727fa692083c472f4..fabbcd04fed96ad814d04c2df9394f43bfe0cf99 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -133,7 +133,8 @@ XlaJitCompiledCpuFunction::Compile( jit->executable_ = std::move(executable); jit->buffer_infos_ = std::move(buffer_infos); jit->arg_index_table_ = std::move(arg_index_table); - jit->program_shape_ = std::move(program_shape); + jit->program_shape_ = + absl::make_unique(program_shape->ToProto()); jit->static_data_.set_raw_function(raw_function); jit->static_data_.set_buffer_infos(jit->buffer_infos_.data()); jit->static_data_.set_num_buffers(jit->buffer_infos_.size()); diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h index d3c8f22a8078d03d15447ed200c914390f40b04f..a5392057177e983e11787c31bb496a8947add1e6 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h @@ -80,8 +80,10 @@ class XlaJitCompiledCpuFunction { std::vector arg_names_; std::vector result_names_; - // The backing data for the program shape. - std::unique_ptr program_shape_; + // The backing data for the program shape. The proto form of program shape is + // used because the program shape is serialized and embedded in the object + // file. + std::unique_ptr program_shape_; }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index 6d49298a6f3e8a726695fafc42f3c5341fe98b5f..4496255d003a588a46cc27dd0d77491341915cd0 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -116,7 +116,7 @@ TEST(XlaJitCompiledCpuFunction, Sum) { // Check program shape. using xla::ShapeUtil; const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {}); - const xla::ProgramShape* program_shape = function.ProgramShape(); + const xla::ProgramShapeProto* program_shape = function.ProgramShape(); ASSERT_TRUE(program_shape != nullptr); ASSERT_EQ(program_shape->parameters_size(), 2); EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(0), s32)); diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 49a6ebbbdeb5b533b30f5571e23f0aba3da87a61..14237df69081016817fbd1a5332f22996e7f264d 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -130,8 +130,7 @@ XlaOpRegistry::~XlaOpRegistry() = default; // Lazily register the CPU and GPU JIT devices the first time // GetCompilationDevice is called. static void* registration_init = [®istry]() { - legacy_flags::MarkForCompilationPassFlags* flags = - legacy_flags::GetMarkForCompilationPassFlags(); + MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); bool cpu_global_jit = flags->tf_xla_cpu_global_jit; mutex_lock lock(registry.mutex_); diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index d914e97b6bd4506251dc4be504d6ab427590e615..4360e0857964b0ac63fc887e269b04a4b00d854a 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -226,12 +226,14 @@ cc_library( "index_util.cc", "layout_util.cc", "primitive_util.cc", + "shape.cc", "shape_util.cc", ], hdrs = [ "index_util.h", "layout_util.h", "primitive_util.h", + "shape.h", "shape_util.h", ], visibility = ["//visibility:public"], @@ -254,6 +256,23 @@ cc_library( ], ) +tf_cc_test( + name = "shape_test", + srcs = ["shape_test.cc"], + deps = [ + ":shape_util", + ":status_macros", + ":test", + ":test_helpers", + ":types", + ":util", + ":xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", + ], +) + tf_cc_test( name = "shape_util_test", srcs = ["shape_util_test.cc"], diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h index 782c966b4c57672d137569a318fb20ace14d493b..e4aca98f67d50287a83afc6f41a59458f3df2da2 100644 --- a/tensorflow/compiler/xla/array2d.h +++ b/tensorflow/compiler/xla/array2d.h @@ -104,7 +104,7 @@ std::unique_ptr> MakeLinspaceArray2D(double from, double to, int64 count = n1 * n2; NativeT step = static_cast((count > 1) ? (to - from) / (count - 1) : 0); - auto set = [&array, n1, n2](int64 index, NativeT value) { + auto set = [&array, n2](int64 index, NativeT value) { (*array)(index / n2, index % n2) = value; }; for (int64 i = 0; i < count - 1; ++i) { diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 42da0ebf4992884187bbe21701a44d8ba2fccd64..e1f3674c4f8eb61c3ad481bf057edaeba050cd82 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -191,6 +191,7 @@ cc_library( hdrs = ["xla_computation.h"], visibility = ["//visibility:public"], deps = [ + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index f833ddcd3235e08e2d0d3c0b9921e96ef871c89e..c5733bc66deb8d55a9186ad1893abaf17ed6909e 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -164,7 +164,6 @@ cc_library( deps = [ ":constants", ":math", - ":numeric", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", @@ -178,8 +177,9 @@ cc_library( srcs = ["sorting.cc"], hdrs = ["sorting.h"], deps = [ - ":numeric", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", ], @@ -188,10 +188,6 @@ cc_library( xla_test( name = "sorting_test", srcs = ["sorting_test.cc"], - blacklisted_backends = [ - "cpu", - "gpu", - ], tags = ["enable_for_xla_interpreter"], deps = [ ":sorting", diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/numeric.h index efd8cdc25724198633e0bf1c48c4e7d9e4b4c9e1..f62fdab4b0e5e84347cfaa1424a8c2e5c58dd3ce 100644 --- a/tensorflow/compiler/xla/client/lib/numeric.h +++ b/tensorflow/compiler/xla/client/lib/numeric.h @@ -22,9 +22,6 @@ limitations under the License. namespace xla { -// Returns a rank 1 tensor of `type` containing values [0, 1, 2, ...]. -XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size); - // Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere // else. XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n); diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index c6f68c8ee2f5198017c37abeb9551478f52a99f4..85b9e1827dcef5ed907d893277deb5a52f8f30e9 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -18,7 +18,6 @@ limitations under the License. #include "absl/base/casts.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/math.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc index 0475fd9c94f6e390b5169cfe2cbba8eae28ddc18..e8553a08bb014e790822a14e128686b60b8d6b7c 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.cc +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/sorting.h" -#include "tensorflow/compiler/xla/client/lib/numeric.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -23,13 +25,12 @@ XlaOp TopK(XlaOp input, int64 k) { return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); int last_dim = input_shape.dimensions_size() - 1; - int last_dim_size = input_shape.dimensions(last_dim); - XlaOp iota_s32 = Iota(builder, S32, last_dim_size); + Shape iota_shape = + ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions())); + XlaOp iota_s32 = Iota(builder, iota_shape, last_dim); auto input_dims = input_shape.dimensions(); - std::vector broadcast_dims(input_dims.begin(), input_dims.end() - 1); - XlaOp broadcast_s32 = Broadcast(iota_s32, broadcast_dims); - XlaOp sort_result = Sort(Neg(input), {broadcast_s32}); + XlaOp sort_result = Sort(Neg(input), {iota_s32}); std::vector start_indices(input_shape.dimensions_size(), 0); std::vector limit_indices(input_dims.begin(), input_dims.end()); limit_indices[last_dim] = k; diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index fef98c9923096e21a755c6d730de2c7c10852b2d..ebb30d3acc492a115f4980aaa4d2d08f73683864 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -56,5 +56,13 @@ XLA_TEST_F(SortingTest, TopKFullSort) { ComputeAndCompareR1(&builder, inputs, {}); } +XLA_TEST_F(SortingTest, TopKFullSortWithDuplicates) { + XlaBuilder builder(TestName()); + XlaOp a; + auto a_data = CreateR1Parameter({1, 1, 2, 2, 1}, 0, "a", &builder, &a); + xla::GetTupleElement(xla::TopK(a, 5), 1); + ComputeAndCompareR1(&builder, {2, 3, 0, 1, 4}, {a_data.get()}); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 0a587725d20507555382ef0657bdc08369a7fbac..f17bc456a62a714167cf092d012b03df5b18e50e 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -239,6 +239,19 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, visited->insert(op_handle); } +Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num, + ShapeIndex dynamic_size_param_index, + int64 target_param_num, + ShapeIndex target_param_index, + int64 target_dim_num) { + TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind( + DynamicParameterBinding::DynamicParameter{dynamic_size_param_num, + dynamic_size_param_index}, + DynamicParameterBinding::DynamicDimension{ + target_param_num, target_param_index, target_dim_num})); + return Status::OK(); +} + XlaComputation XlaBuilder::BuildAndNoteError() { DCHECK(parent_builder_ != nullptr); auto build_status = Build(); @@ -275,7 +288,8 @@ StatusOr XlaBuilder::Build(int64 root_id) { HloComputationProto entry; SetProtoIdAndName(&entry, name_, kNameSeparator, GetNextId()); - TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id)); + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, GetProgramShape(root_id)); + *entry.mutable_program_shape() = program_shape.ToProto(); entry.set_root_id(root_id); for (auto& instruction : instructions_) { @@ -297,6 +311,9 @@ StatusOr XlaBuilder::Build(int64 root_id) { } module->add_computations()->Swap(&entry); + *(module->mutable_dynamic_parameter_binding()) = + dynamic_parameter_binding_.ToProto(); + // Clear data held by this builder. this->instructions_.clear(); this->handle_to_index_.clear(); @@ -1303,6 +1320,15 @@ XlaOp XlaBuilder::AfterAll(absl::Span tokens) { if (tokens.empty()) { return InvalidArgument("AfterAll requires at least one operand"); } + for (int i = 0; i < tokens.size(); ++i) { + const XlaOp& operand = tokens[i]; + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); + if (!ShapeUtil::IsToken(operand_shape)) { + return InvalidArgument( + "All operands to AfterAll must be tokens; operand %d has shape %s", + i, ShapeUtil::HumanString(operand_shape)); + } + } HloInstructionProto instr; *instr.mutable_shape() = ShapeUtil::MakeTokenShape(); return AddInstruction(std::move(instr), HloOpcode::kAfterAll, tokens); @@ -2356,7 +2382,7 @@ StatusOr XlaBuilder::BuildConstantSubGraph( SetProtoIdAndName(&entry, StrCat(name_, "_compute_constant"), kNameSeparator, GetNextId()); entry.set_root_id(root->id()); - ProgramShape* program_shape = entry.mutable_program_shape(); + ProgramShapeProto* program_shape = entry.mutable_program_shape(); *program_shape->mutable_result() = root->shape(); // We use std::set to keep the instruction ids in ascending order (which is diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 68314a026eab0db3eaf321f0fa53c016d79882ba..78c90dbccc486370377408d54406f4a896f60816 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -263,35 +264,30 @@ class XlaBuilder { // evaluating the computation. StatusOr IsConstant(const XlaOp& operand) const; + // Sets up binding which indicates that the `target_dim_num` in the subshape + // `target_param_index` of parameter `target_param_num` is a dynamic dimension + // and its real dynamic size is represented by `dynamic_param_index` in + // parameter `dynamic_param_num`. + // + // TODO(b/119520625): Remove this API once we have more dynamic shape infra + // ready. + Status SetDynamicBinding(int64 dynamic_size_param_num, + ShapeIndex dynamic_size_param_index, + int64 target_param_num, + ShapeIndex target_param_index, int64 target_dim_num); + private: // Build helper which takes the id of the root operation.. StatusOr Build(int64 root_id); - // Enqueues a "retrieve parameter value" instruction for a parameter that was - // passed to the computation. + // Description for the methods below can be found in the corresponding public + // functions section in this file. + XlaOp Parameter(int64 parameter_number, const Shape& shape, const string& name); - // Enqueues a constant with the value of the given literal onto the - // computation. XlaOp ConstantLiteral(const LiteralSlice& literal); - // Enqueues a constant onto the computation. Methods are templated on the - // native host type (NativeT) which corresponds to a specific XLA - // PrimitiveType as given in the following table: - // - // Native Type PrimitiveType - // ----------------------------- - // bool PRED - // int32 S32 - // int64 S64 - // uint32 U32 - // uint64 U64 - // float F32 - // double F64 - // - // Note: not all primitive types defined in xla_data.proto have a - // corresponding native type yet. template XlaOp ConstantR0(NativeT value); template @@ -321,181 +317,78 @@ class XlaBuilder { template XlaOp ConstantR4FromArray4D(const Array4D& values); - // Enqueues a rank one constant (vector) onto the computation. The vector has - // size 'length' and every element has the value 'value'. template XlaOp ConstantR1(int64 length, NativeT value); - // Adds dimensions to an array by duplicating the data in the array. - // - // The new dimensions are inserted on the left, i.e. if - // broadcast_sizes has values {a0, ..., aN} and the operand shape - // has dimensions {b0, ..., bM} then the shape of the output has - // dimensions {a0, ..., aN, b0, ..., bM}. - // - // The new dimensions index into copies of the operand, i.e. - // - // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] XlaOp Broadcast(const XlaOp& operand, absl::Span broadcast_sizes); XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape, const absl::Span broadcast_dimensions); - // Enqueues a pad operation onto the computation that pads the given value on - // the edges as well as between the elements of the input. padding_config - // specifies the padding amount for each dimension. XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value, const PaddingConfig& padding_config); - // Enqueues an operation onto the computation that flattens the operand based - // on the dimension order (major/slowest-varying to minor/fastest-varying) - // given, followed by reshaping it into the shape with the given dimension - // sizes (also major to minor). Conceptually, this is a limited form of - // "shape casting". XlaOp Reshape(const XlaOp& operand, absl::Span dimensions, absl::Span new_sizes); - // Enqueues an operation onto the computation that collapses the operand, from - // first to last dimension (C order), then reshapes it to the given dimension - // sizes. Conceptually, this is a limited form of "shape casting". XlaOp Reshape(const XlaOp& operand, absl::Span new_sizes); - // Wrapper for Reshape. - // Enqueues an operation to collapse the provided dimensions; e.g. an - // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to - // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must - // be a consecutive, in-order subsequence of the operand dimensions. - // - // Note that collapsing a single dimension does nothing: - // - // {256} collapsing {0} => {256} - // {1} collapsing {0} => {1} - // - // Collapsing multiple dimensions produces a single result dimension: - // - // {256, 2} collapsing {0,1} => {512} - // {256, 2, 3} collapsing {0,1} => {512, 3} - // - // This could potentially cause data to be moved -- it provides a more - // structured form of reshaping than an arbitrary Reshape operation. XlaOp Collapse(const XlaOp& operand, absl::Span dimensions); - // Enqueues a slice operation onto the computation that slices the operand - // from the start indices to the limit indices; e.g. - // - // x - // [ 0 1 2 3 ] - // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ] - // [ 8 9 a b ] - // - // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D - // range notation. - // The strides parameter determines the stride over the slice XlaOp Slice(const XlaOp& operand, absl::Span start_indices, absl::Span limit_indices, absl::Span strides); - // Enqueues a slice operation in a given dimension, taking all other - // dimensions as they are; e.g. if dimno is 1 from start_index 2 to - // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand - // for: - // - // array[:, 2:4:1, :] XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - // Enqueues a slice operation onto the computation that slices the 'operand' - // from dynamic start indices which are passed in 'start_indices'. - // The size of the slice in each dimension is passed in 'slice_sizes', - // which specify the end point of exclusive slice intervals in each - // dimension [start, start + size). - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo input dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices, absl::Span slice_sizes); - // Enqueues a dynamic update slice operation onto the computation, which - // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. - // The shape of 'update' determines the shape of the slice of 'operand' - // which is updated. - // The indices specified in 'start_indices' specify the offset of the slice - // of 'operand' which is updated. - // - // update = {10, 11} // calculated at runtime. - // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ] - // [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11] - // [7 8 9] [7 8 9 ] - // - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo update dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update, const XlaOp& start_indices); - // Enqueues a concatenate instruction onto the computation. 'operands' must - // have >= 1 entry. XlaOp ConcatInDim(absl::Span operands, int64 dimension); - // Enqueue a tracing operation onto the computation; the computation will emit - // a logging message with the operand. void Trace(const string& tag, const XlaOp& operand); - // Enqueues a conditional-move-like select operation onto the computation; - // predicated on pred, selects between on_true and on_false. XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false); - // Enqueues a tuple-creation instruction onto the computation. XlaOp Tuple(absl::Span elements); - // Enqueues a tuple-element-get instruction onto the computation. XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index); - // Enqueues an equal-to comparison instruction onto the computation. XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a not-equal comparison instruction onto the computation. XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a greater-or-equal comparison instruction onto the computation. XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a greater-than comparison instruction onto the computation. XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a less-than comparison instruction onto the computation. XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a less-or-equal comparison instruction onto the computation. XlaOp Le(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a dot instruction onto the computation. XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs, const PrecisionConfig* precision_config = nullptr); - // Enqueues a general dot instruction onto the computation. XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs, const DotDimensionNumbers& dimension_numbers, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, which uses the - // default convolution dimension numbers. XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, Padding padding, int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration in the format returned by MakePadding(). XlaOp ConvWithGeneralPadding( const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, @@ -503,8 +396,6 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided dimension numbers configuration. XlaOp ConvWithGeneralDimensions( const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, Padding padding, @@ -512,8 +403,6 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration as well as the dimension numbers. XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, absl::Span> padding, @@ -521,8 +410,6 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration, dilation factors and dimension numbers. XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs, absl::Span window_strides, absl::Span> padding, @@ -532,80 +419,53 @@ class XlaBuilder { int64 feature_group_count = 1, const PrecisionConfig* precision_config = nullptr); - // Enqueues an FFT instruction onto the computation, of the given type and - // with the given FFT length. XlaOp Fft(const XlaOp& operand, FftType fft_type, absl::Span fft_length); - // Enqueues an infeed instruction onto the computation, which writes data of - // the given shape to the infeed buffer of the device. XlaOp Infeed(const Shape& shape, const string& config = ""); XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape, const string& config = ""); - // Enqueues an outfeed instruction onto the computation. This instruction - // generates outgoing data transfers for the given data. - // - // shape_with_layout communicates the laid out shape that we want to outfeed - // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error - // will occur. void Outfeed(const XlaOp& operand, const Shape& shape_with_layout, const string& outfeed_config); XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token, const Shape& shape_with_layout, const string& outfeed_config); - // Enqueues a call instruction onto the computation. XlaOp Call(const XlaComputation& computation, absl::Span operands); - // Enqueues a custom call instruction onto the computation. XlaOp CustomCall( const string& call_target_name, absl::Span operands, const Shape& shape_with_layout, const string& opaque, absl::optional> operand_shapes_with_layout); - // The following methods enqueue element-wise binary arithmetic operations - // onto the computation. The shapes of the operands have to match unless one - // of the operands is a scalar, or an explicit broadcast dimension is given - // (see g3doc for more details). - - // Enqueues a complex compose instruction onto the computation. XlaOp Complex(const XlaOp& real, const XlaOp& imag, absl::Span broadcast_dimensions = {}); - // Enqueues a complex conjugate instruction onto the computation. XlaOp Conj(const XlaOp& operand); - // Enqueues an add instruction onto the computation. XlaOp Add(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a subtract instruction onto the computation. XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a multiply instruction onto the computation. XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a divide instruction onto the computation. XlaOp Div(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a remainder instruction onto the computation. XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a max instruction onto the computation. XlaOp Max(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues a min instruction onto the computation. XlaOp Min(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Element-wise logical operators XlaOp And(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); @@ -624,32 +484,23 @@ class XlaBuilder { XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Reduces an array among the provided dimensions, given "computation" as a - // reduction operator. XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation, absl::Span dimensions_to_reduce); - // Reduces several arrays simultaneously among the provided dimensions, given - // "computation" as a reduction operator. XlaOp Reduce(absl::Span operands, absl::Span init_values, const XlaComputation& computation, absl::Span dimensions_to_reduce); - // Convenience wrapper around the above that reduces all the dimensions in the - // operand shape. XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation); - // Enqueues a windowed reduce instruction onto the computation. XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation, absl::Span window_dimensions, absl::Span window_strides, Padding padding); - // As ReduceWindow(), but the padding is given in the format - // returned by MakePadding(). XlaOp ReduceWindowWithGeneralPadding( const XlaOp& operand, const XlaOp& init_value, const XlaComputation& computation, @@ -659,48 +510,22 @@ class XlaBuilder { absl::Span window_dilations, absl::Span> padding); - // Returns the sum of the operand value within each subgroup of replicas. All - // replicas supply one input to the sum and all replicas receive the resulting - // sum for each subgroup. XlaOp CrossReplicaSum(const XlaOp& operand, absl::Span replica_groups = {}); - // Enqueues an operation that do an AllReduce of the operand cross cores. Here - // AllReduce means doing a reduction on the input operand cross cores and then - // broadcasting the reduction result to those cores. The reduction function is - // defined by `computation`, which should be a commutative computation on - // scalars, e.g., add, min, or max. The way that AllReduce is applied is - // configured by: - // - // - `replica_groups`: each ReplicaGroup contains a list of replica id. If - // empty, all replicas belong to one group. Allreduce will be applied within - // subgroups. For example, we have 4 replicas, then - // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0, - // replica 1 and 3 are in subgroup 1. - // - // - `channel_id`: for Allreduce nodes from different modules, if they have - // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will - // not be applied cross modules. - // - // TODO(b/117564385): Rename this to AllReduce when it's ready to use. XlaOp CrossReplicaSum( const XlaOp& operand, const XlaComputation& computation, absl::Span replica_groups = {}, const absl::optional& channel_id = absl::nullopt); - // Enqueues an operation that do an Alltoall of the operand cross cores. XlaOp AllToAll(const XlaOp& operand, int64 split_dimension, int64 concat_dimension, int64 split_count, const std::vector& replica_groups); - // Enqueues an operation that do an CollectivePermute of the operand cross - // cores. XlaOp CollectivePermute( const XlaOp& operand, const std::vector>& source_target_pairs); - // Enqueues an operation that scatters the `source` array to the selected - // indices of each window. XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, absl::Span window_dimensions, absl::Span window_strides, @@ -708,8 +533,6 @@ class XlaBuilder { const XlaOp& init_value, const XlaComputation& scatter); - // As SelectAndScatter(), but the padding is given in the format - // returned by MakePadding(). XlaOp SelectAndScatterWithGeneralPadding( const XlaOp& operand, const XlaComputation& select, absl::Span window_dimensions, @@ -717,217 +540,119 @@ class XlaBuilder { absl::Span> padding, const XlaOp& source, const XlaOp& init_value, const XlaComputation& scatter); - // Enqueues an abs instruction onto the computation. XlaOp Abs(const XlaOp& operand); - // Enqueues a atan2 instruction onto the computation. XlaOp Atan2(const XlaOp& y, const XlaOp& x, absl::Span broadcast_dimensions = {}); - // Enqueues an exp instruction onto the computation. XlaOp Exp(const XlaOp& operand); - // Enqueues an expm1 instruction onto the computation. XlaOp Expm1(const XlaOp& operand); - // Enqueues a floor instruction onto the computation. XlaOp Floor(const XlaOp& operand); - // Enqueues a ceil instruction onto the computation. XlaOp Ceil(const XlaOp& operand); - // Enqueues a round instruction onto the computation, rounding to nearest even - // with half-way cases rounding away from zero. XlaOp Round(const XlaOp& operand); - // Enqueues an log instruction (natural logarithm) onto the computation. XlaOp Log(const XlaOp& operand); - // Enqueues an log1p instruction (log(x+1)) onto the computation. XlaOp Log1p(const XlaOp& operand); - // Enqueues a sign instruction onto the computation. XlaOp Sign(const XlaOp& operand); - // Enqueues a count leading zeros instruction onto the computation. XlaOp Clz(const XlaOp& operand); - // Enqueues a cosine instruction onto the computation. XlaOp Cos(const XlaOp& operand); - // Enqueues a sine instruction onto the computation. XlaOp Sin(const XlaOp& operand); - // Enqueues a tanh instruction onto the computation. XlaOp Tanh(const XlaOp& operand); - // Enqueues a real-part instruction onto the computation. XlaOp Real(const XlaOp& operand); - // Enqueues an imaginary-part instruction onto the computation. XlaOp Imag(const XlaOp& operand); - // Enqueues a lhs^rhs computation onto the computation. XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs, absl::Span broadcast_dimensions = {}); - // Enqueues an operator that tests if the operand's values are finite, i.e., - // not Inf or NaN. Defined only for floating-point types. Returns an array of - // booleans with the same shape where entries are true iff the corresponding - // entry was NaN. XlaOp IsFinite(const XlaOp& operand); - // Enqueues an iota operation onto the computation. XlaOp Iota(const Shape& shape, int64 iota_dimension); - // Enqueues a rank-1 iota operation onto the computation. XlaOp Iota(PrimitiveType type, int64 size); - // Enqueues a convert instruction onto the computation that changes the - // element type of the operand array to primitive_type. XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type); - // Enqueues a no-op instruction onto the computation that changes - // the element type of the operand array to primitive_type. The - // bit-widths of the source and destination element types must be - // identical. XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type); - // Enqueues a negate instruction onto the computation. XlaOp Neg(const XlaOp& operand); - // Enqueues a transpose instruction onto the computation. XlaOp Transpose(const XlaOp& operand, absl::Span permutation); - // Enqueues a reverse instruction onto the computation. The order of the - // elements in the given dimensions is reversed (i.e., the element at index i - // is moved to index dimension_size - 1 - i). XlaOp Rev(const XlaOp& operand, absl::Span dimensions); - // Enqueues a sort (as increasing order) instruction onto the computation. - // If only keys are provided: - // * If the keys are an rank-1 tensor (an array), the result is a sorted array - // of keys, in ascending order. - // * If the keys have higher rank, the keys are sorted along the provided - // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension - // value of 0 will indepenently sort every column, and a dimension value of 1 - // will independently sort each row. If no dimension number is provided, then - // the last dimension is chosen by default. - // - // If both keys and values are provided: - // * The keys and all values must be tensors with the same dimensions. The - // element types of the tensors may be different. - // * The result is a tuple that consists of a sorted tensor of keys (along the - // provided dimension, as above) as the first element, and tensors with their - // corresponding values as the other elements. XlaOp Sort(const XlaOp& keys, absl::Span values = {}, int64 dimension = -1); - // Enqueues a clamp instruction onto the computation. XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max); - // Enqueues a map instruction onto the computation. XlaOp Map(absl::Span operands, const XlaComputation& computation, absl::Span dimensions, absl::Span static_operands = {}); - // Enqueues a N(mu, sigma) random number generation instruction onto the - // computation. XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape); - // Enqueues a U(a, b) random number generation instruction onto the - // computation. Returns values in the semi-open interval [a, b). XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape); - // Enqueues a while node onto the computation. XlaOp While(const XlaComputation& condition, const XlaComputation& body, const XlaOp& init); - // Enqueues a conditional node onto the computation. XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand, const XlaComputation& true_computation, const XlaOp& false_operand, const XlaComputation& false_computation); - // Enqueues a ReducePrecision node onto the computation. XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits, const int mantissa_bits); - // Enqueues a Gather node onto the computation. XlaOp Gather(const XlaOp& input, const XlaOp& start_indices, const GatherDimensionNumbers& dimension_numbers, absl::Span slice_sizes); - // Enqueues a Scatter node onto the computation. XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices, const XlaOp& updates, const XlaComputation& update_computation, const ScatterDimensionNumbers& dimension_numbers); - // Enqueues a Send node onto the computation for device-to-device - // communication, to send the given operand to a Recv instruction that shares - // the same channel handle. void Send(const XlaOp& operand, const ChannelHandle& handle); XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token, const ChannelHandle& handle); - // Enqueues a Send node which sends data to the host. XlaOp SendToHost(const XlaOp& operand, const XlaOp& token, const Shape& shape_with_layout, const ChannelHandle& handle); - // Enqueues a Recv node which receives data from the host. XlaOp RecvFromHost(const XlaOp& token, const Shape& shape, const ChannelHandle& handle); - // Enqueues an AfterAll operation with no operands producing a token-shaped - // value. XlaOp CreateToken(); - // Enqueues an AfterAll operation with no operands producing a token-shaped - // value. XlaOp AfterAll(absl::Span tokens); - // Enqueues a Recv node onto the computation. The data comes from a Send - // instruction that shares the same channel handle and its shape must - // be the same as the given shape. XlaOp Recv(const Shape& shape, const ChannelHandle& handle); XlaOp RecvWithToken(const XlaOp& token, const Shape& shape, const ChannelHandle& handle); - // Normalizes operand across spatial and batch dimensions for each feature. - // - // Returns a tuple (normalized, batch_mean, batch_var) where `normalized` - // is the normalized result and batch_mean and batch_var are the mean and - // variance, respectively, across batch for the operand. XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale, const XlaOp& offset, float epsilon, int64 feature_index); - // Normalizes operand across spatial and batch dimensions for each feature. - // - // `BatchNormInference` is equivalent to calling `BatchNormTraining` without - // computing `mean` and `variance` for each batch inside the operation. It - // uses the input `mean` and `variance` instead as estimated values. The - // purpose of this op is to reduce latency in inference, hence the name - // `BatchNormInference`. - // - // The output has the same shape as `operand`, and contains the normalized - // values for each batch. XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale, const XlaOp& offset, const XlaOp& mean, const XlaOp& variance, float epsilon, int64 feature_index); - // Calculates the gradients of a batch norm op. - // - // The inputs `batch_mean` and `batch_var` represent the mean and variance - // across the batch. - // - // Returns a tuple of three elements: - // - grad_operand: Gradient with respect to input `operand` - // - grad_offset: Gradient with respect to input `offset` - // - grad_scale: Gradient with respect to input `scale` XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, const XlaOp& batch_mean, const XlaOp& batch_var, const XlaOp& grad_output, float epsilon, @@ -1019,6 +744,9 @@ class XlaBuilder { // The instructions of this computation. std::vector instructions_; + // Dynamic parameter configuration of this computation. + DynamicParameterBinding dynamic_parameter_binding_; + // A map from XlaOp::Handle to the index in the instructions_ vector where the // instruction is held. absl::flat_hash_map handle_to_index_; @@ -1393,6 +1121,7 @@ class XlaScopedShardingAssignment { // Free functions for building XlaOps. The intention is that these will // become the public API for building XlaOps rather than calling methods on // XlaBuilder directly. +// // Enqueues a "retrieve parameter value" instruction for a parameter that was // passed to the computation. @@ -2138,6 +1867,7 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale, XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension); // Implementation details below this point. +// template XlaOp XlaBuilder::ConstantR0(NativeT value) { diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 8aa85c3cd63c9b0aeb55d2cebbb989b6432ac959..e534fb67fd9bda8fb83af8ca036b35e158271c0d 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -446,5 +446,14 @@ TEST_F(XlaBuilderTest, ProtoMatches) { EXPECT_EQ(c0_string, c1_string); } +TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) { + XlaBuilder b(TestName()); + AfterAll(&b, {CreateToken(&b), ConstantR0(&b, 1.0)}); + Status status = b.Build().status(); + ASSERT_IS_NOT_OK(status); + EXPECT_THAT(status.error_message(), + ::testing::HasSubstr("All operands to AfterAll must be tokens")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc index c9870b65b91c1ebd7d44143faf215a2d5c2a2fc5..f317892c12529b2ee8a81788f6bbcae3b3d6489d 100644 --- a/tensorflow/compiler/xla/client/xla_computation.cc +++ b/tensorflow/compiler/xla/client/xla_computation.cc @@ -25,7 +25,7 @@ namespace xla { StatusOr XlaComputation::GetProgramShape() const { TF_RET_CHECK(proto_.has_host_program_shape()); - return proto_.host_program_shape(); + return ProgramShape(proto_.host_program_shape()); } StatusOr> XlaComputation::Snapshot() const { diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h index 71598ef8b296a760b0ee818fce0a59aed5cfc6b4..3ccbfb28bd0c5939ee40878e9cc298688882ac62 100644 --- a/tensorflow/compiler/xla/client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_computation.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py index fb135f5ceda67ce6c001de15b8f3f084ca164826..1fea816a803bfb75b9721393cef8c4dfc249268d 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -18,12 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import math - import numpy as _np # Avoids becoming a part of public Tensorflow API. from tensorflow.compiler.xla import xla_data_pb2 -from tensorflow.compiler.xla.python_api import xla_shape from tensorflow.core.framework import attr_value_pb2 @@ -64,22 +61,18 @@ class Sharding(object): tile_assignment_devices=[core])) @classmethod - def tile(cls, tile_shape, tile_assignment): + def tile(cls, tile_assignment): """Returns a Tiled sharding attribute. This causes an op to be partially computed on multiple cores in the XLA device. Args: - tile_shape: A xla_shape.Shape describing the tile shape that each core - will compute. - The tile shape does not need to be divisible by the tile assignment. tile_assignment: An np.ndarray describing the topology of the tiling and which device will compute which part of the topology. Raises: - TypeError: tile_assignment was not of np.array type or tile_shape was - not of xla_shape.Shape type. + TypeError: tile_assignment was not of np.array type. TODO(jmolloy): This concept is nefarious and is not something we really want to expose to users (especially as the @@ -87,14 +80,11 @@ class Sharding(object): """ if not isinstance(tile_assignment, _np.ndarray): raise TypeError('Tile assignment must be of type np.ndarray') - if not isinstance(tile_shape, xla_shape.Shape): - raise TypeError('Tile shape must be of type xla_shape.Shape') dims = list(tile_assignment.shape) flattened_devices = tile_assignment.reshape(-1, order='C') return Sharding( proto=xla_data_pb2.OpSharding( type=xla_data_pb2.OpSharding.OTHER, - tile_shape=tile_shape.message, tile_assignment_dimensions=dims, tile_assignment_devices=list(flattened_devices))) @@ -118,14 +108,8 @@ class Sharding(object): shape = tensor.shape.as_list() if shape[split_dimension] < num_devices: raise ValueError('Split dimension was smaller than the required number ' - 'of splits: shape=%r, dimension=%r, num_devices=%r', - shape, split_dimension, num_devices) - - tile_shape = shape - tile_shape[split_dimension] = int( - math.ceil(tile_shape[split_dimension] / num_devices)) - tile_shape_proto = xla_data_pb2.Shape( - element_type=xla_data_pb2.F32, dimensions=tile_shape) + 'of splits: shape=%r, dimension=%r, num_devices=%r' % + (shape, split_dimension, num_devices)) tile_assignment_dims = [1] * len(shape) tile_assignment_dims[split_dimension] = num_devices @@ -133,7 +117,6 @@ class Sharding(object): return Sharding( proto=xla_data_pb2.OpSharding( type=xla_data_pb2.OpSharding.OTHER, - tile_shape=tile_shape_proto, tile_assignment_dimensions=tile_assignment_dims, tile_assignment_devices=range(num_devices))) @@ -149,7 +132,6 @@ class Sharding(object): type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=tuple_shardings) else: proto = self._proto - attr_value = attr_value_pb2.AttrValue(s=proto.SerializeToString()) # TODO(jmolloy): This need to be seriously revisited before declaring this # API available for public use. @@ -194,8 +176,8 @@ def assign_device(tensor, device): return tensor -def tile(tensor, tile_shape, tile_assignment): - Sharding.tile(tile_shape, tile_assignment).apply_to_tensor(tensor) +def tile(tensor, tile_assignment): + Sharding.tile(tile_assignment).apply_to_tensor(tensor) return tensor diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml index 756b932332885efc86ac7650e50767acac6da01c..12b7094705e75305dc43a013576f4549dd5f4185 100644 --- a/tensorflow/compiler/xla/g3doc/_book.yaml +++ b/tensorflow/compiler/xla/g3doc/_book.yaml @@ -3,11 +3,11 @@ upper_tabs: - include: /_upper_tabs_left.yaml - include: /api_docs/_upper_tabs_api.yaml # Dropdown menu -- name: Ecosystem - path: /ecosystem +- name: Resources + path: /resources is_default: true menu: - - include: /ecosystem/_menu_toc.yaml + - include: /resources/_menu_toc.yaml lower_tabs: # Subsite tabs other: diff --git a/tensorflow/compiler/xla/g3doc/_index.yaml b/tensorflow/compiler/xla/g3doc/_index.yaml index 7934cd11ba22d3f47e172726f54ce51d15eb2cad..858de427119bfcfa82d0b1158776bf269129fd92 100644 --- a/tensorflow/compiler/xla/g3doc/_index.yaml +++ b/tensorflow/compiler/xla/g3doc/_index.yaml @@ -17,7 +17,7 @@ landing_page: - classname: devsite-landing-row-cards items: - heading: XLA - TensorFlow, compiled - image_path: /ecosystem/images/tf-logo-card-16x9.png + image_path: /resources/images/tf-logo-card-16x9.png path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html buttons: - label: Read on Google Developers blog @@ -28,7 +28,7 @@ landing_page: - label: Watch the video path: https://www.youtube.com/watch?v=kAOanJczHA0 - heading: XLA on GitHub - image_path: /ecosystem/images/github-card-16x9.png + image_path: /resources/images/github-card-16x9.png path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla buttons: - label: View on GitHub diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 73a9db75f6bf090bba5c3534f14d8ebfa421b5bb..bc87a60c6ec70aa27de534f61f0188078a6a3577 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -13,6 +13,22 @@ arbitrary-dimensional array. For convenience, special cases have more specific and familiar names; for example a *vector* is a 1-dimensional array and a *matrix* is a 2-dimensional array. +## AfterAll + +See also +[`XlaBuilder::AfterAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h). + +AfterAll takes a variadic number of tokens and produces a single token. Tokens +are primitive types which can be threaded between side-effecting operations to +enforce ordering. `AfterAll` can be used as a join of tokens for ordering a +operation after a set operations. + + `AfterAll(operands)` + +Arguments | Type | Semantics +---------- | ------- | ------------------------- +`operands` | `XlaOp` | variadic number of tokens + ## AllToAll See also diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h index 6e0390763da15167b85597462f3e21b8e1eaf732..6c298e57252449ce3f1f9055436e918f2d9f17f1 100644 --- a/tensorflow/compiler/xla/layout_util.h +++ b/tensorflow/compiler/xla/layout_util.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/types/span.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index fcc59f6d213b66193a4fdc763f4995aec8370cd6..36ad7c64866e77187d40f22b364d80230651696b 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -1123,7 +1123,6 @@ void DenseArrayToStringHelper(const LiteralBase& literal, } } pieces->push_back(brace_to_string("}")); - return; } }; diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc index dac6067861733225bacc0ea2d2bbe26f627c250f..bd93517728b052aed854df0f9d9c5447bc3b156f 100644 --- a/tensorflow/compiler/xla/literal_test.cc +++ b/tensorflow/compiler/xla/literal_test.cc @@ -1587,9 +1587,9 @@ TEST_F(LiteralUtilTest, DecomposeTuple) { Literal nested_tuple = LiteralUtil::MakeTuple( {&tuple_elements[0], &tuple_elements[1], &nil_literal}); - EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple.shape())); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple(nested_tuple.shape())); std::vector elements = nested_tuple.DecomposeTuple(); - EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(nested_tuple.shape())); ASSERT_EQ(elements.size(), 3); @@ -1640,7 +1640,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) { EXPECT_EQ(literal.Get({1}, /*shape_index=*/{2, 1}), 44.0); for (const Literal& element : elements) { - EXPECT_TRUE(ShapeUtil::IsNil(element.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(element.shape())); } } diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 4d2a37cfac3e0e89d189f168031e5db44ca5d410..2768ed618d81146aa1d65e5aa847f99bbd7b454a 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -487,12 +487,13 @@ StatusOr LocalComputation::CompileForXrt( xrt::XLAComputation c; auto config = c.mutable_config(); - auto shapes = config->mutable_program_shape(); + ProgramShape shapes; for (auto& shape : argument_shapes) { - *shapes->add_parameters() = shape; + *shapes.add_parameters() = shape; } - TF_ASSIGN_OR_RETURN(*shapes->mutable_result(), GetReturnValueShape()); - LayoutUtil::SetToDefaultLayout(shapes); + TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape()); + LayoutUtil::SetToDefaultLayout(&shapes); + *config->mutable_program_shape() = shapes.ToProto(); auto snapshot = computation().Snapshot().ValueOrDie(); *c.mutable_hlo_snapshot() = *snapshot; diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 779a7000993c18f0cefa3e5c1b0e44a63f5ac8fb..1bd04d2785913c59929478974883b9669e1c1185 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -292,6 +292,7 @@ cc_library( name = "hlo", srcs = [ "dfs_hlo_visitor.cc", + "dynamic_parameter_binding.cc", "hlo_computation.cc", "hlo_input_output_alias_config.cc", "hlo_instruction.cc", @@ -305,6 +306,7 @@ cc_library( hdrs = [ "dfs_hlo_visitor.h", "dfs_hlo_visitor_with_default.h", + "dynamic_parameter_binding.h", "hlo_clone_context.h", "hlo_computation.h", "hlo_domain_metadata.h", @@ -350,6 +352,25 @@ cc_library( ], ) +tf_cc_test( + name = "dynamic_parameter_binding_test", + srcs = ["dynamic_parameter_binding_test.cc"], + deps = [ + ":hlo", + ":hlo_dce", + ":hlo_memory_scheduler", + ":hlo_ordering", + ":hlo_parser", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + "@com_google_absl//absl/algorithm:container", + ], +) + tf_cc_test( name = "dfs_hlo_visitor_with_default_test", srcs = ["dfs_hlo_visitor_with_default_test.cc"], @@ -1709,7 +1730,9 @@ cc_library( ":hlo", ":hlo_pass", ":hlo_query", + ":pattern_matcher", ":while_loop_analysis", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1722,9 +1745,14 @@ tf_cc_test( name = "while_loop_simplifier_test", srcs = ["while_loop_simplifier_test.cc"], deps = [ + ":algebraic_simplifier", ":hlo", + ":hlo_cse", ":hlo_dce", ":hlo_matchers", + ":hlo_pass", + ":hlo_pass_pipeline", + ":tuple_simplifier", ":while_loop_simplifier", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -2858,6 +2886,46 @@ tf_cc_test( ], ) +cc_library( + name = "hlo_get_dimension_size_rewriter", + srcs = ["hlo_get_dimension_size_rewriter.cc"], + hdrs = ["hlo_get_dimension_size_rewriter.h"], + deps = [ + ":hlo", + ":hlo_pass", + ":shape_inference", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + ], +) + +tf_cc_test( + name = "hlo_get_dimension_size_rewriter_test", + srcs = ["hlo_get_dimension_size_rewriter_test.cc"], + deps = [ + ":hlo", + ":hlo_get_dimension_size_rewriter", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "device_memory_allocator", srcs = [ @@ -2916,6 +2984,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@llvm//:core", "@llvm//:transform_utils", @@ -3321,9 +3390,9 @@ cc_library( ":tuple_util", ":while_loop_analysis", ":while_util", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 89e62bd2f0dc02d2d0947ae47e3bb0c9955f103e..56bf3a9f69d718db1b2845c6901a893a2fe1660b 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -84,7 +84,8 @@ bool TransposeIsBitcast(const HloInstruction* transpose) { // reshape may still be a bitcast. For example, a reshape from [28x28] to [784]. bool ReshapeOrCopyIsBitcast( const HloInstruction* instr, - const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) { + const AlgebraicSimplifierOptions::ValidBitcastCallback& + valid_bitcast_callback) { CHECK(HloOpcode::kReshape == instr->opcode() || HloOpcode::kCopy == instr->opcode()); @@ -180,21 +181,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { const bool changed() const { return changed_; } // Runs the visitor on a computation. - static bool Run( - HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction, bool enable_conv_simplification); + static bool Run(HloComputation* computation, + const AlgebraicSimplifierOptions& options); private: - explicit AlgebraicSimplifierVisitor( - HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction, bool enable_conv_simplification) - : computation_(computation), - is_layout_sensitive_(is_layout_sensitive), - valid_bitcast_callback_(std::move(valid_bitcast_callback)), - enable_dot_strength_reduction_(enable_dot_strength_reduction), - enable_conv_simplification_(enable_conv_simplification) {} + explicit AlgebraicSimplifierVisitor(HloComputation* computation, + const AlgebraicSimplifierOptions& options) + : computation_(computation), options_(options) {} // Transforms Dots where at least one input is a vector or has a degenerate // dimension and converts it into a multiply and reduce. This should enable @@ -233,10 +226,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { HloInstruction* new_instruction); // Returns whether the shape of the output of the given instructions are the - // same for the purposes of simplification. If is_layout_sensitive_ is true, - // then this tests shape equality including layout (ShapeUtil::Equal). If - // is_layout_sensitive_ is false, then the tests shape compatibility - // (ShapeUtil::Compatible). + // same for the purposes of simplification. If options_.is_layout_sensitive() + // is true, then this tests shape equality including layout + // (ShapeUtil::Equal). If options_.is_layout_sensitive() is false, then the + // tests shape compatibility (ShapeUtil::Compatible). bool SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const; // Returns whether it was possible to transform `root` to a clamp instruction. @@ -325,22 +318,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // traversing. HloComputation* computation_; + // The backend-specific options selected for the algebraic simplifier. + const AlgebraicSimplifierOptions& options_; + // Whether algebraic simplification has occurred. bool changed_ = false; - // Whether layout is considered during transformation. - bool is_layout_sensitive_; - - // Callback used to determine if a bitcast is possible. - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_; - - // Disable dot strength reduction on platforms where it causes a slowdown. - bool enable_dot_strength_reduction_; - - // Disable convolution -> dot simplification on platforms where it causes a - // slowdown. - bool enable_conv_simplification_; - // Cached computation for adding two scalar F32. HloComputation* scalar_add_computation_ = nullptr; }; @@ -348,19 +331,15 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { } // namespace bool AlgebraicSimplifierVisitor::Run( - HloComputation* computation, bool is_layout_sensitive, - AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction, bool enable_conv_simplification) { - AlgebraicSimplifierVisitor visitor( - computation, is_layout_sensitive, std::move(valid_bitcast_callback), - enable_dot_strength_reduction, enable_conv_simplification); + HloComputation* computation, const AlgebraicSimplifierOptions& options) { + AlgebraicSimplifierVisitor visitor(computation, options); TF_CHECK_OK(computation->Accept(&visitor)); return visitor.changed_; } bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const { - if (is_layout_sensitive_) { + if (options_.is_layout_sensitive()) { return ShapeUtil::Equal(lhs->shape(), rhs->shape()); } else { return ShapeUtil::Compatible(lhs->shape(), rhs->shape()); @@ -504,8 +483,8 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) { return Status::OK(); } - if (is_layout_sensitive_ && - ReshapeOrCopyIsBitcast(copy, valid_bitcast_callback_)) { + if (options_.is_layout_sensitive() && + ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) { ReplaceWithBitcast(copy); } @@ -1215,7 +1194,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { return ReplaceInstruction(dot, dot_of_gather_optimized); } - if (enable_dot_strength_reduction_ && !is_layout_sensitive_) { + if (options_.enable_dot_strength_reduction() && + !options_.is_layout_sensitive()) { TF_ASSIGN_OR_RETURN(bool did_strength_reduction, HandleDotStrengthReduction(dot)); if (did_strength_reduction) { @@ -1910,8 +1890,8 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) { } // Make this a bitcast if possible. - if (is_layout_sensitive_ && - ReshapeOrCopyIsBitcast(reshape, valid_bitcast_callback_)) { + if (options_.is_layout_sensitive() && + ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) { ReplaceWithBitcast(reshape); return Status::OK(); } @@ -2501,6 +2481,108 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) { return ReplaceWithNewInstruction( sort, HloInstruction::CreateTuple(sort->operands())); } + if (!options_.enable_permutation_sort_replacement()) { + return Status::OK(); + } + // Check if we are sorting a permutation. In that case, we know that the keys + // will be sorted to the identity permutation, and we can represent the + // changes to the 'values' parameter as a scatter. + if (sort->operand_count() == 2 && + operand->opcode() == HloOpcode::kGetTupleElement) { + const HloInstruction* other_sort = operand->operand(0); + // Check whether the 'values' parameter is the result of another sort with + // the same sort dimension. + if (other_sort->opcode() == HloOpcode::kSort && + other_sort->operand_count() >= 2 && + other_sort->dimensions(0) == dimension_to_sort && + other_sort->operand(operand->tuple_index())->opcode() == + HloOpcode::kIota) { + auto* iota = + Cast(other_sort->operand(operand->tuple_index())); + // The sort operand needs to be an integral iota, and the iota dimension + // needs to be the dimension that was sorted. + if (iota->iota_dimension() == dimension_to_sort && + ShapeUtil::ElementIsIntegral(iota->shape())) { + // We use the following construction method for a Scatter that applies + // the permutation from 'keys' to the 'values' parameter. + // - Take the "keys" parameter of the second sort and reshape it to have + // another "1" dimension at the end. + // - Concatenate it with iotas of the same extended shape with all + // different iota_dimensions except the dimension_to_sort in the order + // of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and + // dimension_to_sort = 1, we would have concatenate of (iota with + // iota_dimension=0, keys, iota with iota_dimension = 2) + // - Use this as the indices parameter of scatter, and set updates + // of the scatter to be a reshaped 'values' parameter of sort (adding + // 'rank' many 1 dimensions at the end). + int64 rank = ShapeUtil::Rank(operand->shape()); + Shape extended_shape = operand->shape(); + extended_shape.add_dimensions(1); + extended_shape.mutable_layout()->add_minor_to_major(rank); + auto reshaped_permutation = computation_->AddInstruction( + HloInstruction::CreateReshape(extended_shape, operand)); + std::vector concat_operands; + for (int64 i = 0; i < rank; ++i) { + if (i == dimension_to_sort) { + concat_operands.push_back(reshaped_permutation); + } else { + concat_operands.push_back(computation_->AddInstruction( + HloInstruction::CreateIota(extended_shape, i))); + } + } + Shape concat_shape = operand->shape(); + concat_shape.add_dimensions(rank); + concat_shape.mutable_layout()->add_minor_to_major(rank); + auto scatter_indices = + rank > 1 ? computation_->AddInstruction( + HloInstruction::CreateConcatenate( + concat_shape, concat_operands, rank)) + : reshaped_permutation; + + // We don't care about the operand, it will be completely overridden by + // the updates. + auto scatter_operand = computation_->AddInstruction( + HloInstruction::CreateIota(sort->operand(1)->shape(), 0)); + + // Construct the updates operand of scatter. + Shape update_shape = sort->operand(1)->shape(); + for (int64 i = 0; i < rank; ++i) { + update_shape.add_dimensions(1); + update_shape.mutable_layout()->add_minor_to_major(rank + i); + } + auto scatter_updates = + computation_->AddInstruction(HloInstruction::CreateReshape( + update_shape, sort->mutable_operand(1))); + + // Construct the updates computation, which simply replaces the operand + // values with the update values. + HloComputation::Builder b("update_replace_computation"); + Shape scalar_shape = ShapeUtil::MakeShape(S32, {}); + b.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs")); + auto scalar_rhs = b.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs")); + auto update_replace_computation = + computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs)); + + ScatterDimensionNumbers dim_numbers; + dim_numbers.set_index_vector_dim(rank); + for (int64 i = 0; i < rank; ++i) { + dim_numbers.add_update_window_dims(rank + i); + dim_numbers.add_scatter_dims_to_operand_dims(i); + } + auto scatter = + computation_->AddInstruction(HloInstruction::CreateScatter( + sort->operand(1)->shape(), scatter_operand, scatter_indices, + scatter_updates, update_replace_computation, dim_numbers)); + return ReplaceWithNewInstruction( + sort, HloInstruction::CreateTuple( + {computation_->AddInstruction(HloInstruction::CreateIota( + operand->shape(), dimension_to_sort)), + scatter})); + } + } + } return Status::OK(); } @@ -2525,7 +2607,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) { return ReplaceInstruction(transpose, operand); } - if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) { + if (options_.is_layout_sensitive() && TransposeIsBitcast(transpose)) { ReplaceWithBitcast(transpose); return Status::OK(); } @@ -2674,13 +2756,13 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( const ConvolutionDimensionNumbers& dnums = convolution->convolution_dimension_numbers(); - if (!enable_conv_simplification_) { + if (!options_.enable_conv_simplification()) { return false; } // TODO(b/31337498): For now, we cowardly refuse to do this optimization in // layout-insensitive mode, for fear of adding nontrivial reshapes. - if (!is_layout_sensitive_) { + if (!options_.is_layout_sensitive()) { return false; } @@ -2770,9 +2852,9 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( // We cannot insert bitcasts if the layouts will not be compatible. // TODO(b/33178038): Consider inserting a transpose if a bitcast would be // invalid. - if (!valid_bitcast_callback_(input_shape, new_input_shape) || - !valid_bitcast_callback_(filter_shape, new_filter_shape) || - !valid_bitcast_callback_(dot_output_shape, convolution_shape)) { + if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) || + !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) || + !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) { return false; } @@ -2878,9 +2960,7 @@ StatusOr AlgebraicSimplifier::Run(HloModule* module) { "AlgebraicSimplifier::Run(), before:\n" + module->ToString()); bool changed = false; for (auto* comp : module->MakeNonfusionComputations()) { - if (AlgebraicSimplifierVisitor::Run( - comp, is_layout_sensitive_, valid_bitcast_callback_, - enable_dot_strength_reduction_, enable_conv_simplification_)) { + if (AlgebraicSimplifierVisitor::Run(comp, options_)) { changed = true; } } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index 9f8d0ee88bdebcf17310cd0407b1b99e4b0a7b5f..d2775b9fafa7e4c625f5d181114e80e7369f9c78 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -23,8 +23,7 @@ limitations under the License. namespace xla { -// A pass which performs algebraic simplifications. -class AlgebraicSimplifier : public HloModulePass { +class AlgebraicSimplifierOptions { public: // Given shapes 'from_shape' and 'to_shape', determines if it is valid to // bitcast from 'from_shape' to 'to_shape' after considering platform @@ -34,18 +33,63 @@ class AlgebraicSimplifier : public HloModulePass { using ValidBitcastCallback = std::function; + explicit AlgebraicSimplifierOptions( + ValidBitcastCallback valid_bitcast_callback) + : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {} + // If valid_bitcast_callback returns true, then the pass will replace reshapes + // and transposes with bitcasts. + const ValidBitcastCallback& valid_bitcast_callback() const { + return valid_bitcast_callback_; + } + + // If is_layout_sensitive is true, then the simplifier preserves layout during + // transformation. Otherwise, layout is ignored. + void set_is_layout_sensitive(bool is_layout_sensitive) { + is_layout_sensitive_ = is_layout_sensitive; + } + bool is_layout_sensitive() const { return is_layout_sensitive_; } + + // Enable dot simplification on platforms where it is profitable. + void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) { + enable_dot_strength_reduction_ = enable_dot_strength_reduction; + } + bool enable_dot_strength_reduction() const { + return enable_dot_strength_reduction_; + } + + // Enable convolution simplification on platforms where it is profitable. + void set_enable_conv_simplification(bool enable_conv_simplification) { + enable_conv_simplification_ = enable_conv_simplification; + } + bool enable_conv_simplification() const { + return enable_conv_simplification_; + } + + // If enable_permutation_sort_replacement is true, a sort op that is known to + // sort a permutation will be replaced with a scatter op. + void set_enable_permutation_sort_replacement( + bool enable_permutation_sort_replacement) { + enable_permutation_sort_replacement_ = enable_permutation_sort_replacement; + } + bool enable_permutation_sort_replacement() const { + return enable_permutation_sort_replacement_; + } + + private: + ValidBitcastCallback valid_bitcast_callback_; + bool is_layout_sensitive_{false}; + bool enable_dot_strength_reduction_{true}; + bool enable_conv_simplification_{true}; + bool enable_permutation_sort_replacement_{false}; +}; + +// A pass which performs algebraic simplifications. +class AlgebraicSimplifier : public HloModulePass { + public: // If is_layout_sensitive is true, then the simplifier preserves layout during - // transformation. Otherwise, layout is ignored. If valid_bitcast_callback - // returns true, then the pass will replace reshapes and transposes with - // bitcasts. - AlgebraicSimplifier(bool is_layout_sensitive, - ValidBitcastCallback valid_bitcast_callback, - bool enable_dot_strength_reduction = true, - bool enable_conv_simplification = true) - : is_layout_sensitive_(is_layout_sensitive), - valid_bitcast_callback_(std::move(valid_bitcast_callback)), - enable_dot_strength_reduction_(enable_dot_strength_reduction), - enable_conv_simplification_(enable_conv_simplification) {} + // transformation. Otherwise, layout is ignored. + explicit AlgebraicSimplifier(const AlgebraicSimplifierOptions& options) + : options_(options) {} ~AlgebraicSimplifier() override = default; absl::string_view name() const override { return "algsimp"; } @@ -54,14 +98,7 @@ class AlgebraicSimplifier : public HloModulePass { StatusOr Run(HloModule* module) override; private: - bool is_layout_sensitive_; - ValidBitcastCallback valid_bitcast_callback_; - - // Enable dot simplification on platforms where it is profitable. - bool enable_dot_strength_reduction_; - - // Enable convolution simplification on platforms where it is profitable. - bool enable_conv_simplification_; + AlgebraicSimplifierOptions options_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index e4c4da1b0e7aef0e3476e4d232e410da25794e13..8b8ba2a77d9bec7a6baf6929a0566906727be319 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -45,15 +45,18 @@ using ::testing::ElementsAre; namespace op = xla::testing::opcode_matchers; -AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() { +AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() { return [](const Shape&, const Shape&) { return true; }; } -AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() { +AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() { return [](const Shape&, const Shape&) { return false; }; } -class AlgebraicSimplifierTest : public HloTestBase {}; +class AlgebraicSimplifierTest : public HloTestBase { + protected: + AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()}; +}; // Test that A + 0 is simplified to A TEST_F(AlgebraicSimplifierTest, AddZero) { @@ -70,8 +73,7 @@ TEST_F(AlgebraicSimplifierTest, AddZero) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -92,8 +94,7 @@ TEST_F(AlgebraicSimplifierTest, MulZero) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kMultiply); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), zero); } @@ -115,8 +116,7 @@ TEST_F(AlgebraicSimplifierTest, SelectTrue) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSelect); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); } @@ -138,8 +138,7 @@ TEST_F(AlgebraicSimplifierTest, SelectFalse) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSelect); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param1); } @@ -159,8 +158,7 @@ TEST_F(AlgebraicSimplifierTest, SelectIdentical) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSelect); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param1); } @@ -196,8 +194,7 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) { builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero, dims1, add_computation)); m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = m->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reduce(param, zero)); @@ -219,8 +216,7 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Constant())); @@ -246,8 +242,7 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2))); @@ -269,8 +264,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -306,8 +300,7 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kMap); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero))); @@ -329,8 +322,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAdd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -345,8 +337,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Constant())); @@ -362,8 +353,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); @@ -378,8 +368,7 @@ TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Iota()); @@ -400,8 +389,7 @@ TEST_F(AlgebraicSimplifierTest, SubZero) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -422,8 +410,7 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kSubtract); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param0, op::Negate(constant))); @@ -450,8 +437,7 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) { EXPECT_THAT(computation->root_instruction(), op::Divide(op::Divide(param0, param1), param2)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -479,8 +465,7 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) { EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Divide(param1, param2))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -513,8 +498,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) { computation->root_instruction(), op::Divide(op::Divide(param0, param1), op::Divide(param2, param3))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT( @@ -541,8 +525,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) { EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Exp(param1))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -570,8 +553,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) { EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Power(param1, param2))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -600,8 +582,7 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) { EXPECT_THAT(computation->root_instruction(), op::Divide(param0, op::Power(param1, param2))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); ASSERT_THAT(computation->root_instruction(), @@ -623,8 +604,7 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -648,8 +628,7 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) { inner_power, exp2)); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Power(base, op::Multiply(exp1, exp2))); @@ -673,8 +652,7 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) { inner_power, exp2)); m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie()); } @@ -693,8 +671,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -715,8 +692,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, div); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -740,8 +716,7 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, cplx); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -765,8 +740,7 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, real); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -790,8 +764,7 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, imag); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param1); @@ -818,8 +791,7 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, add); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Add(param1, param2)); @@ -846,8 +818,7 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) { EXPECT_THAT(computation->root_instruction(), op::Divide(op::Exp(param0), op::Exp(param1))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -875,8 +846,7 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) { EXPECT_THAT(computation->root_instruction(), op::Multiply(op::Exp(param0), op::Exp(param1))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -902,8 +872,7 @@ TEST_F(AlgebraicSimplifierTest, PowExp) { EXPECT_THAT(computation->root_instruction(), op::Power(op::Exp(param0), param1)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -929,8 +898,7 @@ TEST_F(AlgebraicSimplifierTest, LnPow) { EXPECT_THAT(computation->root_instruction(), op::Log(op::Power(param0, param1))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -953,8 +921,7 @@ TEST_F(AlgebraicSimplifierTest, LnExp) { EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); @@ -983,8 +950,7 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) { EXPECT_THAT(computation->root_instruction(), op::Log(op::Divide(op::Exp(param0), op::Exp(param1)))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1)); @@ -1007,8 +973,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) { EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); @@ -1032,8 +997,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) { EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); @@ -1061,8 +1025,7 @@ TEST_F(AlgebraicSimplifierTest, Pow1) { EXPECT_THAT(computation->root_instruction(), op::Power(param0, one)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), param0); @@ -1084,8 +1047,7 @@ TEST_F(AlgebraicSimplifierTest, Pow2) { EXPECT_THAT(computation->root_instruction(), op::Power(param0, two)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0)); @@ -1107,8 +1069,7 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) { EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); @@ -1153,8 +1114,7 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) { ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2))); m->AddEntryComputation(builder.Build()); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); EXPECT_THAT(m->entry_computation()->root_instruction(), op::Convolution(lhs, rhs)); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); @@ -1196,8 +1156,7 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))), window, add_computation)); m->AddEntryComputation(builder.Build()); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); EXPECT_THAT(m->entry_computation()->root_instruction(), op::ReduceWindow(param, op::Constant())); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); @@ -1226,8 +1185,7 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) { m->AddEntryComputation(builder.Build()); EXPECT_THAT(m->entry_computation()->root_instruction(), op::Pad(param, op::Constant())); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(m->entry_computation()->root_instruction(), op::Broadcast(op::Constant())); @@ -1253,8 +1211,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) { EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reshape(op::Broadcast(op::Reshape(op)))); - HloPassFix simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + HloPassFix simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(m->entry_computation()->root_instruction(), op); @@ -1273,8 +1230,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) { EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), input); @@ -1294,8 +1250,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) { EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param0); @@ -1316,14 +1271,16 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) { auto computation = m->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Copy(param)); - AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier1(options); ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie()); // Verify that the copy is not replaced. EXPECT_THAT(computation->root_instruction(), op::Copy(param)); - AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options2(bitcasting_callback()); + options2.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier2(options2); ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie()); // Verify that the copy is replaced. EXPECT_THAT(computation->root_instruction(), op::Bitcast(param)); @@ -1343,8 +1300,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) { EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param0); @@ -1375,8 +1331,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) { computation->root_instruction(), op::Concatenate(empty_literal, param0, param0, empty_slice, param1)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -1423,8 +1378,7 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT( @@ -1455,8 +1409,7 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) { EXPECT_THAT(computation->root_instruction(), op::Concatenate(empty_literal, empty_slice)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_EQ(computation->root_instruction(), empty_literal); @@ -1479,8 +1432,7 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1)); } @@ -1504,8 +1456,9 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) { EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); // Copy has not been removed. @@ -1531,8 +1484,9 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) { EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Copy has been removed. @@ -1559,8 +1513,9 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) { EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); // Reshape is not replaced with a bitcast. @@ -1588,8 +1543,8 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifier simplifier( + (AlgebraicSimplifierOptions(bitcasting_callback()))); EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that that reshape(transpose(rng)) is replace by a single rng of the @@ -1639,8 +1594,9 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { op::Tuple(transformable_reshape, dimensions_wrong_reshape, layout_wrong_reshape)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); simplifier.Run(m.get()).ValueOrDie(); // Verify that only the first reshape is replaced. @@ -1667,8 +1623,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) { builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifier simplifier( + (AlgebraicSimplifierOptions(bitcasting_callback()))); m->AddEntryComputation(builder.Build()); EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } @@ -1692,8 +1648,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) { HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add, /*broadcast_dimensions=*/{0, 1})); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifier simplifier( + (AlgebraicSimplifierOptions(bitcasting_callback()))); m->AddEntryComputation(builder.Build()); EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } @@ -1717,8 +1673,9 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) { EXPECT_THAT(computation->root_instruction(), op::Transpose(param)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that the reshape is replaced. @@ -1744,8 +1701,9 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) { EXPECT_THAT(computation->root_instruction(), op::Transpose(param)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); // Verify that the reshape is replaced. @@ -1771,8 +1729,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Reshape(param0))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(param0)); @@ -1798,8 +1755,9 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) { EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - non_bitcasting_callback()); + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(options); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Copy(param0)); @@ -1823,8 +1781,7 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) { EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Transpose(param0)); @@ -1848,8 +1805,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) { EXPECT_THAT(computation->root_instruction(), op::Broadcast(op::Reshape(param0))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0)); @@ -1871,8 +1827,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param0))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0)); @@ -1893,8 +1848,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -1916,8 +1870,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param)); @@ -1940,8 +1893,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Broadcast(param)); @@ -1966,8 +1918,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Broadcast(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), @@ -1986,8 +1937,7 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Iota()); @@ -2006,8 +1956,7 @@ TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) { EXPECT_THAT(computation->root_instruction(), op::Iota()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); auto root = computation->root_instruction(); @@ -2029,8 +1978,7 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); @@ -2048,8 +1996,7 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Iota()); @@ -2070,8 +2017,7 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Iota()); @@ -2093,8 +2039,7 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) { EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota())); @@ -2122,8 +2067,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); @@ -2153,8 +2097,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); auto has_negative_padding = [](const HloInstruction* pad) { for (auto& padding_dimension : pad->padding_config().dimensions()) { @@ -2189,8 +2132,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) { EXPECT_THAT(computation->root_instruction(), op::Reshape(param)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); @@ -2212,8 +2154,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) { EXPECT_THAT(computation->root_instruction(), op::Slice(param)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); @@ -2241,8 +2182,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) { EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Slice(param)); @@ -2273,8 +2213,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) { EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Slice(param))); @@ -2298,8 +2237,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) { EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param))); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } @@ -2312,12 +2250,84 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) { builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys)); auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), keys); } +TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) { + const char* hlo_string = R"( + HloModule permutation_sort + + ENTRY sort_computation { + keys = f32[64,8732]{1,0} parameter(0) + values = s32[64,8732]{1,0} iota(), iota_dimension=1 + sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1} + gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1 + ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_enable_permutation_sort_replacement(true); + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Tuple(op::Iota(), + op::Scatter(op::Iota(), + op::Concatenate(op::Iota(), op::Reshape()), + op::Reshape()))); +} + +TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) { + // Same as ReplacePermutationSortWithScatter except that the iota has F32 + // type. + const char* hlo_string = R"( + HloModule permutation_sort + + ENTRY sort_computation { + keys = f32[64,8732]{1,0} parameter(0) + values = f32[64,8732]{1,0} iota(), iota_dimension=1 + sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1} + gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1 + ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_enable_permutation_sort_replacement(true); + AlgebraicSimplifier simplifier(options); + EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); +} + +TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) { + // Same as ReplacePermutationSortWithScatter except that the sort dimensions + // don't match. + const char* hlo_string = R"( + HloModule permutation_sort + + ENTRY sort_computation { + keys = f32[64,8732]{1,0} parameter(0) + values = s32[64,8732]{1,0} iota(), iota_dimension=1 + sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1} + gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1 + ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options(non_bitcasting_callback()); + options.set_enable_permutation_sort_replacement(true); + AlgebraicSimplifier simplifier(options); + EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); +} + TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { auto builder = HloComputation::Builder(TestName()); @@ -2334,8 +2344,7 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) { keys, {values0, values1})); auto module = CreateNewVerifiedModule(); HloComputation* computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Tuple(keys, values0, values1)); @@ -2356,8 +2365,7 @@ TEST_F(AlgebraicSimplifierTest, AndTrue) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2378,8 +2386,7 @@ TEST_F(AlgebraicSimplifierTest, AndTrue2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2400,8 +2407,7 @@ TEST_F(AlgebraicSimplifierTest, AndFalse) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_false); @@ -2422,8 +2428,7 @@ TEST_F(AlgebraicSimplifierTest, AndFalse2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kAnd); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_false); @@ -2444,8 +2449,7 @@ TEST_F(AlgebraicSimplifierTest, OrTrue) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_true); @@ -2466,8 +2470,7 @@ TEST_F(AlgebraicSimplifierTest, OrTrue2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, const_true); @@ -2488,8 +2491,7 @@ TEST_F(AlgebraicSimplifierTest, OrFalse) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2510,8 +2512,7 @@ TEST_F(AlgebraicSimplifierTest, OrFalse2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kOr); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_EQ(root, param0); @@ -2641,8 +2642,7 @@ TEST_P(ConvInputPaddingTest, DoTest) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); if (testcase.expected_conv_window.empty()) { ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } else { @@ -2759,8 +2759,7 @@ TEST_P(ConvFilterPaddingTest, DoIt) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); if (testcase.expected_conv_window.empty()) { ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } else { @@ -2908,8 +2907,9 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { auto module = CreateNewUnverifiedModule(); auto* computation = module->AddEntryComputation(b.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, - bitcasting_callback()); + AlgebraicSimplifierOptions simplifier_options(bitcasting_callback()); + simplifier_options.set_is_layout_sensitive(true); + AlgebraicSimplifier simplifier(simplifier_options); if (!simplifier.Run(module.get()).ValueOrDie()) { return "NO_CHANGE"; } @@ -3032,8 +3032,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { EXPECT_EQ(root, slice); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); @@ -3071,8 +3070,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { EXPECT_EQ(root, reshape); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); @@ -3138,8 +3136,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reduce_window); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. @@ -3224,8 +3221,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reduce_window); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. @@ -3258,8 +3254,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) { auto module = CreateNewVerifiedModule(); auto computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); @@ -3295,8 +3290,7 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) { m->AddEmbeddedComputation(std::move(dot_computation)); m->AddEntryComputation(call_builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); } @@ -3313,8 +3307,7 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) { auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Tuple(op::Constant(), op::Constant())); @@ -3337,8 +3330,7 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) { /*slice_sizes=*/{10, 100, 1000})); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Parameter()); } @@ -3371,8 +3363,7 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) { 3, ShapeUtil::MakeShape(U32, {3}), "update_indices")))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::DynamicSlice(op::Parameter(), op::Parameter())); @@ -3394,8 +3385,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Constant())); @@ -3421,8 +3411,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(op::Parameter(0))); @@ -3442,8 +3431,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Iota()); @@ -3464,8 +3452,7 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) { auto computation = m->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Iota()); @@ -3486,8 +3473,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Constant())); @@ -3507,8 +3494,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Reshape(op::Constant())); @@ -3528,8 +3515,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); } @@ -3547,8 +3534,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Parameter()); @@ -3569,8 +3556,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Parameter(1)); @@ -3591,8 +3578,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Slice(op::Parameter(2))); @@ -3613,8 +3600,8 @@ TEST_F(AlgebraicSimplifierTest, NegateNegate) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Parameter(0)); @@ -3633,8 +3620,8 @@ TEST_F(AlgebraicSimplifierTest, NotNot) { TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_string)); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); + AlgebraicSimplifierOptions options(bitcasting_callback()); + AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Parameter(0)); @@ -3733,8 +3720,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) { output_shape, pad, zero, window, add_computation)); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); @@ -3815,8 +3801,7 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) { builder.AddInstruction(HloInstruction::CreateDot( dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = module->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get())); const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1; const bool computation_should_be_modified = @@ -3845,7 +3830,7 @@ struct DotOfConcatTestSpec { }; class DotOfConcatSimplificationTest - : public HloTestBase, + : public AlgebraicSimplifierTest, public ::testing::WithParamInterface {}; // Test that we transform @@ -3893,8 +3878,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) { dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); @@ -3958,8 +3942,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) { dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( @@ -4000,8 +3983,7 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) { const HloComputation* const computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), operand); } @@ -4021,7 +4003,7 @@ struct DotOfGatherTestSpec { }; class DotOfGatherSimplificationTest - : public HloTestBase, + : public AlgebraicSimplifierTest, public ::testing::WithParamInterface {}; // input: dot(DS(ctA), ctB)) @@ -4078,8 +4060,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { dot_shape, ds, rhs, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( @@ -4149,8 +4130,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { dot_shape, lhs, ds, dot_dnums, DefaultPrecisionConfig(2))); auto computation = m->AddEntryComputation(builder.Build()); - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); + AlgebraicSimplifier simplifier(default_options_); TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get())); ASSERT_TRUE(run_successful); EXPECT_TRUE( diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 4b3ab56dea0e9c29e0ecacf694878792318e9953..8d7c62447852fd946440c41389300a92377c471f 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -746,8 +746,7 @@ StatusOr> BufferAssigner::Run( LogicalBuffer::AlignmentFunction color_alignment, bool allow_input_output_aliasing, bool allocate_buffers_for_constants, BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) { - BufferAssigner assigner(allow_input_output_aliasing, - allocate_buffers_for_constants, std::move(colorer), + BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer), std::move(reuse_checker)); return assigner.CreateAssignment(module, std::move(hlo_ordering), std::move(buffer_size), diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index d8e1612b899f10a5793f9c65c59a41024dfdddd1..0a9fdede803e84ca42472259084615c031b206eb 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -545,12 +545,10 @@ class BufferAssigner { ReuseAllocationFunction reuse_checker = nullptr); private: - BufferAssigner(bool allow_input_output_aliasing, - bool allocate_buffers_for_constants, + BufferAssigner(bool allocate_buffers_for_constants, BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) - : allow_input_output_aliasing_(allow_input_output_aliasing), - allocate_buffers_for_constants_(allocate_buffers_for_constants), + : allocate_buffers_for_constants_(allocate_buffers_for_constants), colorer_(colorer), reuse_checker_(reuse_checker) {} virtual ~BufferAssigner() = default; @@ -640,10 +638,6 @@ class BufferAssigner { LogicalBuffer::Color::Hasher> SplitBuffersByColor(const absl::flat_hash_set& buffers); - // If true, buffer assignments assumes that input parameter buffers and output - // buffers can be shared if their sizes match. - bool allow_input_output_aliasing_; - // If true, allocate buffers for constant instructions. bool allocate_buffers_for_constants_; diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index b1fc50cb1881241a0a53b024b06342308cabdd62..8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -137,8 +137,7 @@ class BufferAssignmentTest : public HloTestBase { } std::unique_ptr RunBufferAssignmentWithInstructionSequence( - HloModule* module, - absl::Span instruction_sequence, + HloModule* module, absl::Span instruction_sequence, int64 alignment = 1) { HloSchedule schedule(module); schedule.set_sequence(module->entry_computation(), instruction_sequence); @@ -1853,7 +1852,7 @@ class WhileBufferAssignmentTest : public HloTestBase { std::unique_ptr RunBufferAssignment(HloModule* module, int64 alignment = 1) { HloSchedule schedule = - ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie(); + ScheduleModule(module, ByteSizeOf).ConsumeValueOrDie(); return BufferAssigner::Run( module, absl::make_unique(schedule), ByteSizeOf, @@ -2162,7 +2161,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { // nodes are traversed during BufferAssignment. TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/sizeof(void*)); })); @@ -2391,15 +2390,16 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { RunCopyInsertion(module.get()); HloSchedule schedule = - ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie(); + ScheduleModule(module.get(), ByteSizeOf).ConsumeValueOrDie(); // To trigger b/38494731, we want a specific Hlo schedule for the // root computation, so we overwrite that entry with a manually // crafted sequence. - schedule.set_sequence(module->entry_computation(), - {input1, weights1, one, output1, while1->operand(0), - while1, input0, weights0, zero, output0, - while0->operand(0), while0, gte0, gte1, root_add}); + schedule.set_sequence( + module->entry_computation(), + {input1, weights1, one, output1, while1->mutable_operand(0), while1, + input0, weights0, zero, output0, while0->mutable_operand(0), while0, + gte0, gte1, root_add}); // If this ASSERT fails, we constructed a bogus sequence above and this test // itself is buggy. diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index 67132274c0dcbfda831c79836d052bb51b753ec7..0237f16673eca32828c3c774a049860fc83182a5 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -86,15 +86,15 @@ CompileOnlyService::CompileAheadOfTime( Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot)); } - const auto& program_shape = instance.computation.host_program_shape(); ExecutionOptions execution_options; *execution_options.mutable_debug_options() = debug_options; *execution_options.mutable_shape_with_output_layout() = *instance.result_layout; TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(program_shape, instance.argument_layouts, - &execution_options)); + CreateModuleConfig( + ProgramShape(instance.computation.host_program_shape()), + instance.argument_layouts, &execution_options)); TF_ASSIGN_OR_RETURN( std::unique_ptr hlo_module, diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h index c899ffb9dc562426ef14c0d414469c04debeec70..844b42a38d7539cccd5c4e30071c0ea6693e3bba 100644 --- a/tensorflow/compiler/xla/service/computation_placer.h +++ b/tensorflow/compiler/xla/service/computation_placer.h @@ -105,8 +105,6 @@ class ComputationPlacer { // Map from platform kind to computation placer singleton. static std::map* GetPlatformComputationPlacers(); - se::Platform::Id platform_id_; - TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer); }; diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 4e547d925f62dce1d2dd23a39a28ca8c23ba9f2f..df6059663876dfde71f4c75d3931b3d2de72c1df 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -442,7 +442,6 @@ class CopyRemover { const HloOrdering& ordering, HloModule* module) : module_(module), alias_analysis_(alias_analysis), - ordering_(ordering), buffer_value_tracker_(*module, alias_analysis, ordering) {} // Try to elide the given copy. The copy is elided if the instruction is not @@ -1003,7 +1002,6 @@ class CopyRemover { HloModule* module_; const HloAliasAnalysis& alias_analysis_; - const HloOrdering& ordering_; // Object tracking the HLO values contained in each HLO buffer. BufferValueTracker buffer_value_tracker_; diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 2763d18121a0c1328ea0c11d825476923ae2b15d..ce4c2a9cc69240b9565b35a3f2504d7fc9373917 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -96,6 +96,7 @@ cc_library( "@com_google_absl//absl/types:span", "//tensorflow/compiler/tf2xla:cpu_function_runtime", "//tensorflow/compiler/xla/service:map_inliner", + "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:scatter_expander", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc index 73b03440cbb936017257b8a92f16dcc25d41e21c..796a7cf94d02b0ad42366387a9d3f8d589b8840a 100644 --- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc +++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc @@ -61,19 +61,6 @@ Disabling these as a starting point. // TODO(b/64227304) Creating a custom pass pipeline will replace this. namespace { -class FilteredFunctionPassManager : public llvm::legacy::FunctionPassManager { - public: - FilteredFunctionPassManager(llvm::Module* m, bool disable_expensive_passes) - : llvm::legacy::FunctionPassManager(m), - disable_expensive_passes_(disable_expensive_passes) {} - void add(llvm::Pass* p) override { - llvm::legacy::FunctionPassManager::add(p); - } - - private: - bool disable_expensive_passes_; -}; - class FilteredPassManager : public llvm::legacy::PassManager { public: explicit FilteredPassManager(bool disable_expensive_passes) @@ -96,8 +83,7 @@ class FilteredPassManager : public llvm::legacy::PassManager { std::unique_ptr CompilerFunctor::operator()( llvm::Module& module) const { FilteredPassManager module_passes(disable_expensive_passes_); - FilteredFunctionPassManager function_passes(&module, - disable_expensive_passes_); + llvm::legacy::FunctionPassManager function_passes(&module); VLOG(2) << "IR before optimizations"; XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module)); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 4ce5a8a29255a763c83941efb6de9b7c652cedb4..2bf24c15c1f050b200b1d9af2d95286f9a9dbe4c 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -76,6 +76,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -249,6 +250,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( &pipeline, module->config().debug_options(), ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION); + pipeline.AddPass(); pipeline.AddPass(); // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner @@ -268,10 +270,10 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true); - pass.AddPass( - /*is_layout_sensitive=*/false, - [](const Shape&, const Shape&) { return false; }, - /*enable_dot_strength_reduction=*/false); + AlgebraicSimplifierOptions options( + [](const Shape&, const Shape&) { return false; }); + options.set_enable_dot_strength_reduction(false); + pass.AddPass(options); pass.AddPass(); // BatchNormExpander can create zero-sized ops, so zero-sized HLO @@ -334,10 +336,11 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn( pass.AddInvariantChecker( /*layout_sensitive=*/true, /*allow_mixed_precision=*/false); - pass.AddPass>( - /*is_layout_sensitive=*/true, - [](const Shape&, const Shape&) { return true; }, - /*enable_dot_strength_reduction=*/false); + AlgebraicSimplifierOptions options( + [](const Shape&, const Shape&) { return true; }); + options.set_is_layout_sensitive(true); + options.set_enable_dot_strength_reduction(false); + pass.AddPass>(options); pass.AddPass(); pass.AddPass(/*is_layout_sensitive=*/true); } @@ -587,9 +590,9 @@ StatusOr> CpuCompiler::RunBackend( // Select an order for emitting the HLO instructions for each // computation. Using this sequence enables tighter buffer liveness analysis // and reduced memory usage (as compared to using DependencyHloOrdering). - TF_ASSIGN_OR_RETURN( - HloSchedule schedule, - ScheduleModule(*module, BufferSizeBytesFunction(), DFSMemoryScheduler)); + TF_ASSIGN_OR_RETURN(HloSchedule schedule, + ScheduleModule(module.get(), BufferSizeBytesFunction(), + DFSMemoryScheduler)); // Run buffer allocation on the HLO graph. TF_ASSIGN_OR_RETURN( @@ -779,7 +782,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, XLA_VLOG_LINES(2, module->ToString()); TF_ASSIGN_OR_RETURN(HloSchedule schedule, - ScheduleModule(*module, BufferSizeBytesFunction())); + ScheduleModule(module, BufferSizeBytesFunction())); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 29abf38e439d919ff93629ed992cb3ff93a929bd..818b2b0d0db2893e11fa46c7867e6c74bbbb6905 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -51,8 +51,7 @@ namespace cpu { CpuExecutable::CpuExecutable( std::unique_ptr jit, std::unique_ptr assignment, - std::unique_ptr hlo_module, - const string& entry_function_name, + std::unique_ptr hlo_module, const string& entry_function_name, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 3c3c047bfe8ee0d1ad90ede2432a86264f47870b..3b91b15ba9b5603b50f78f489e9a3fdad354c083 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -49,7 +49,7 @@ class CpuExecutable : public Executable { public: CpuExecutable(std::unique_ptr jit, std::unique_ptr assignment, - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, const string& entry_function_name, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index b8ace5702688096822573c7afae234cbcbe77b28..92debb83e33b1400a59e5eef0f90971392ab7b22 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -22,7 +22,6 @@ limitations under the License. namespace { const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; -const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; const char* const kXlaEnableExperimentalLlvmIrGemm = "xla_enable_experimental_llvm_ir_gemm"; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 620c45fa391e69ef88269d44709404e6f71b30cb..4032c2da2f33ee61da8771ae6225a14172cbe6e8 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -111,7 +111,7 @@ IrEmitter::IrEmitter( StatusOr IrEmitter::EmitComputation( HloComputation* computation, const string& function_name_prefix, bool is_top_level_computation, - const std::vector* instruction_order) { + const std::vector* instruction_order) { string function_name = name_uniquer_.GetUniqueName(function_name_prefix); VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix << "]; ordered? " << (instruction_order != nullptr); @@ -140,7 +140,7 @@ StatusOr IrEmitter::EmitComputation( // readcyclecounter if it is unavailable. bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 || arch_type_ == llvm::Triple::ArchType::x86_64; - profiling_state_ = ProfilingState(use_rdtscp, GetProfileCountersArgument()); + profiling_state_ = ProfilingState(use_rdtscp); if (instruction_order == nullptr) { TF_RETURN_IF_ERROR(computation->Accept(this)); } else { @@ -1379,33 +1379,6 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { return Status::OK(); } -// Fills up the free variables in 'index_with_free_var' with values from -// 'filler_index'. The size of free variables must be the same as the -// size of 'filler_index'. -// -// This is often used after dimension reduction, where -// 'index_with_free_var' has one or more dimensions reduced, which serves as -// free variables (represented as nullptr). For example, if we have a 4 -// dimensional input and index for the dimension being reduced is -// 2 (third dimension), we will have an index like [i, j, NULL, k] -// after reduced dimension. -// -// Here we fill up that free variable by 'filler_index', which contains -// the value in the reduced dimension. -static llvm_ir::IrArray::Index FillReducedDimensionIndex( - llvm_ir::IrArray::Index index_with_free_var, - llvm_ir::IrArray::Index filler_index) { - llvm_ir::IrArray::Index::const_iterator it = filler_index.begin(); - - for (size_t i = 0; i < index_with_free_var.size(); ++i) { - if (index_with_free_var[i] == nullptr) { - index_with_free_var[i] = *it++; - } - } - CHECK(filler_index.end() == it); - return index_with_free_var; -} - Status IrEmitter::HandleParameter(HloInstruction* parameter) { VLOG(2) << "HandleParameter: " << parameter->ToString(); return EmitTargetAddressForOp(parameter); @@ -2194,14 +2167,6 @@ Status IrEmitter::HandlePad(HloInstruction* pad) { return Status::OK(); } -// If `hlo` is a Transpose, returns its operand; otherwise returns `hlo` itself. -static const HloInstruction* StripTranspose(const HloInstruction& hlo) { - if (hlo.IsRank2Transpose()) { - return hlo.operand(0); - } - return &hlo; -} - Status IrEmitter::HandleFusion(HloInstruction* fusion) { auto* root = fusion->fused_expression_root(); if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) { @@ -2600,10 +2565,17 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) { return Status::OK(); } -Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) { - TF_RET_CHECK(ByteSizeOf(gen_token->shape()) == 0); +Status IrEmitter::HandleAfterAll(HloInstruction* after_all) { + TF_RET_CHECK(ByteSizeOf(after_all->shape()) == 0); // No code to generate, but we need to emit an address for book-keeping. - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(gen_token)); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(after_all)); + return Status::OK(); +} + +Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) { + // AddDedendency just forwards its zero-th operand. + emitted_value_[add_dependency] = + GetEmittedValueFor(add_dependency->operand(0)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 136b88ff75ea8a5f48b42d3476219f18f5ecb39a..559a8162a2d53f28ea6817653503c216af90a610 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -101,7 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, StatusOr EmitComputation( HloComputation* computation, const string& function_name_prefix, bool is_top_level_computation, - const std::vector* instruction_order); + const std::vector* instruction_order); llvm::IRBuilder<>* b() { return &b_; } @@ -159,7 +159,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleConcatenate(HloInstruction* concatenate) override; Status HandleConditional(HloInstruction* conditional) override; Status HandleScatter(HloInstruction* scatter) override; - Status HandleAfterAll(HloInstruction* gen_token) override; + Status HandleAfterAll(HloInstruction* after_all) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; Status HandleRng(HloInstruction* rng) override; Status FinishVisit(HloInstruction* root) override; @@ -467,9 +468,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, // profiling a computation. class ProfilingState { public: - ProfilingState() : use_rdtscp_(false), prof_counters_(nullptr) {} - ProfilingState(bool use_rdtscp, llvm::Value* prof_counters) - : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {} + ProfilingState() : use_rdtscp_(false) {} + explicit ProfilingState(bool use_rdtscp) : use_rdtscp_(use_rdtscp) {} // Record the cycle counter before an HLO executes. void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo); @@ -494,9 +494,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, // intrinsic? bool use_rdtscp_; - // The argument which corresponds to the profile counter buffer. - llvm::Value* prof_counters_; - // The first read cycle counter in the program. llvm::Value* first_read_cycle_start_ = nullptr; diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc index 669eeb95f3299623a7556bfbb8045fd77f5d0745..722aa3120ef4d8c957873ac58c361f19632dde1f 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -41,61 +42,60 @@ void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { std::sort(row_to_sort, row_to_sort + num_elements); } -// For floating point numbers, we want a total order comparator. -NaN and NaN -// should appear at the beginning and end of the ordering, and -0.0 should -// appear before 0.0. Also we want to have a stable sort, so if the keys are the -// same, we compare the index values. -template -bool LessThan(KeyType lhs, int64 lhs_index, KeyType rhs, int64 rhs_index) { - bool lhs_is_negative = std::signbit(lhs); - bool rhs_is_negative = std::signbit(rhs); - // If the signs are different, we can just compare the signs. - if (lhs_is_negative != rhs_is_negative) { - return lhs_is_negative && !rhs_is_negative; - } - bool lhs_nan = std::isnan(lhs); - bool rhs_nan = std::isnan(rhs); - // Exactly one number is nan? - if (lhs_nan != rhs_nan) { - if (lhs_nan) { - return lhs_is_negative; - } - return !rhs_is_negative; +// We would like a total order of floating point numbers so that the +// sort has a predictable behavior in the presence of NaNs. Rather +// than using floating point comparison, we use the following trick: +// If f is a float, and +// x = bit_cast(f); +// y = x < 0 ? 0x7FFFFFFF - x : x; +// then y is ordered as an int32 such that finite values have the +// obvious order, -0 is ordered before 0, and -NaN and NaN appear at +// the beginning and end of the ordering. +template +CastType Convert(KeyType value) { + CastType casted_value; + memcpy(&casted_value, &value, sizeof(CastType)); + if (casted_value < 0) { + return static_cast(std::numeric_limits::max()) - + casted_value; } - if (lhs != rhs) { - return lhs < rhs; - } - return lhs_index < rhs_index; + return casted_value; +} + +template +bool LessThan(KeyType lhs, KeyType rhs) { + return Convert(lhs) < + Convert(rhs); } template <> void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { - std::sort(row_to_sort, row_to_sort + num_elements, - [](const std::pair& lhs, - const std::pair& rhs) -> bool { - return LessThan(lhs.first, lhs.second, rhs.first, rhs.second); - }); + std::stable_sort(row_to_sort, row_to_sort + num_elements, + [](const std::pair& lhs, + const std::pair& rhs) -> bool { + return LessThan(lhs.first, rhs.first); + }); } template <> void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { - std::sort(row_to_sort, row_to_sort + num_elements, - [](const std::pair& lhs, - const std::pair& rhs) -> bool { - return LessThan(lhs.first, lhs.second, rhs.first, rhs.second); - }); + std::stable_sort(row_to_sort, row_to_sort + num_elements, + [](const std::pair& lhs, + const std::pair& rhs) -> bool { + return LessThan(lhs.first, rhs.first); + }); } template <> void KeyValueSort(std::pair* row_to_sort, int64 num_elements) { - std::sort(row_to_sort, row_to_sort + num_elements, - [](const std::pair& lhs, - const std::pair& rhs) -> bool { - return LessThan( - Eigen::half_impl::half_to_float(lhs.first), lhs.second, - Eigen::half_impl::half_to_float(rhs.first), rhs.second); - }); + std::stable_sort(row_to_sort, row_to_sort + num_elements, + [](const std::pair& lhs, + const std::pair& rhs) -> bool { + return LessThan( + Eigen::half_impl::half_to_float(lhs.first), + Eigen::half_impl::half_to_float(rhs.first)); + }); } template diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc index 3b87683ffffefd2aa24dd234cc072425bef00a24..fa0e09ff6b5694c0e97963b83c6e541b858a1376 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc @@ -63,7 +63,7 @@ CHECK-NOT: private constant [48 x i8] )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseHloString(hlo_text)); + ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", @@ -104,14 +104,14 @@ ENTRY main { )"; string filecheck_pattern = R"( -CHECK: private constant [4 x i8] -CHECK: private constant [8 x i8] +CHECK-DAG: private constant [4 x i8] +CHECK-DAG: private constant [8 x i8] CHECK-NOT: private constant [4 x i8] CHECK-NOT: private constant [8 x i8] )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - ParseHloString(hlo_text)); + ParseAndReturnVerifiedModule(hlo_text)); CpuAotCompilationOptions options{ /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index d6371283221b63b30f968929fe2807eae3f22df0..e84bf00153aa28df29d8df486b92654feab4afbf 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -251,6 +251,7 @@ class DfsHloVisitorBase { virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0; + virtual Status HandleAddDependency(HloInstructionPtr add_dependency) = 0; virtual Status HandleAfterAll(HloInstructionPtr token) = 0; // Invoked to inform the visitor that the traversal has completed, and that diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index e57184f639f4f2c618b980a5082381f4b9c28b19..80ea5be298aea44a0f424398da74c4e478f10346 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -206,6 +206,9 @@ class DfsHloVisitorWithDefaultBase Status HandleGetDimensionSize(HloInstructionPtr get_size) override { return DefaultAction(get_size); } + Status HandleAddDependency(HloInstructionPtr add_dependency) override { + return DefaultAction(add_dependency); + } // Invoked to inform the visitor that the traversal has completed, and that // the root was "root". diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc new file mode 100644 index 0000000000000000000000000000000000000000..c8bfc8905064bcd7b68fe259fbcc1546ff083dbd --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc @@ -0,0 +1,138 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" + +namespace xla { + +Status DynamicParameterBinding::Bind( + const DynamicParameter& dynamic_parameter, + const DynamicDimension& dynamic_dimension) { + auto result = bindings_.emplace(dynamic_dimension, dynamic_parameter); + TF_RET_CHECK(result.second); + return Status::OK(); +} + +absl::optional +DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) { + auto param_iter = bindings_.find(dynamic_dimension); + if (param_iter == bindings_.end()) { + return absl::nullopt; + } + return param_iter->second; +} + +DynamicParameterBindingProto DynamicParameterBinding::ToProto() const { + DynamicParameterBindingProto result; + for (const auto& binding : bindings_) { + const DynamicDimension& dynamic_dimension = binding.first; + const DynamicParameter& dynamic_param = binding.second; + DynamicParameterBindingProto::Binding binding_proto; + binding_proto.set_dynamic_param_num(dynamic_param.parameter_num); + for (int64 i : dynamic_param.parameter_index) { + binding_proto.add_dynamic_param_index(i); + } + + binding_proto.set_target_param_num(dynamic_dimension.parameter_num); + + for (int64 i : dynamic_dimension.parameter_index) { + binding_proto.add_target_param_index(i); + } + + binding_proto.set_target_param_dim_num(dynamic_dimension.dimension); + result.add_entries()->Swap(&binding_proto); + } + return result; +} + +StatusOr DynamicParameterBinding::CreateFromProto( + const DynamicParameterBindingProto& proto) { + DynamicParameterBinding result; + for (const DynamicParameterBindingProto::Binding& binding : proto.entries()) { + int64 dynamic_param_num = binding.dynamic_param_num(); + ShapeIndex dynamic_param_index(binding.dynamic_param_index().begin(), + binding.dynamic_param_index().end()); + int64 target_param_num = binding.target_param_num(); + ShapeIndex target_param_index(binding.target_param_index().begin(), + binding.target_param_index().end()); + int64 target_dim_num = binding.target_param_num(); + + TF_RETURN_IF_ERROR( + result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index}, + DynamicDimension{target_param_num, target_param_index, + target_dim_num})); + } + + return result; +} + +string DynamicParameterBinding::ToString() const { + std::vector pieces; + pieces.push_back("DynamicParameterBinding: "); + for (const auto& binding : bindings_) { + const DynamicDimension& dynamic_dimension = binding.first; + const DynamicParameter& dynamic_param = binding.second; + pieces.push_back(absl::StrFormat( + " -- Input param number %lld at %s has dim %lld as dynamic" + " dimension, which is represented by param number %lld at " + "%s", + dynamic_dimension.parameter_num, + dynamic_dimension.parameter_index.ToString(), + dynamic_dimension.dimension, dynamic_param.parameter_num, + dynamic_param.parameter_index.ToString())); + } + return absl::StrJoin(pieces, "\n"); +} + +Status DynamicParameterBinding::ForEachBinding(BindingFn fn) const { + for (const auto& binding : bindings_) { + TF_RETURN_IF_ERROR(fn(binding.second, binding.first)); + } + return Status::OK(); +} + +Status DynamicParameterBinding::Verify(const HloModule& module) const { + const HloComputation* entry = module.entry_computation(); + return ForEachBinding([&](const DynamicParameter& dynamic_parameter, + const DynamicDimension& dynamic_dimension) + -> Status { + TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters()); + TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters()); + TF_RET_CHECK(ShapeUtil::IndexIsValid( + entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(), + dynamic_parameter.parameter_index)); + TF_RET_CHECK(ShapeUtil::IndexIsValid( + entry->parameter_instruction(dynamic_dimension.parameter_num)->shape(), + dynamic_dimension.parameter_index)); + TF_RET_CHECK( + dynamic_dimension.dimension < + ShapeUtil::Rank(ShapeUtil::GetSubshape( + entry->parameter_instruction(dynamic_dimension.parameter_num) + ->shape(), + dynamic_dimension.parameter_index))); + return Status::OK(); + }); +} + +std::ostream& operator<<(std::ostream& out, + const DynamicParameterBinding& binding) { + out << binding.ToString(); + return out; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h new file mode 100644 index 0000000000000000000000000000000000000000..dd474d8eed1b2c30ddb8f624a864198c74eacaba --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h @@ -0,0 +1,125 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_ + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { + +class HloModule; +// We currently use an explicit API that takes an extra parameter to indicate +// the runtime size of a dynamic dimension. DynamicParameterBinding indicates +// the relationship between parameter: We can have a dynamic parameter that +// points to another target parameter to indicate that the target parameter is +// dynamic. +// +// +// TODO(b/119520625): Remove this API once we have more dynamic shape infra +// ready. +class DynamicParameterBinding { + public: + // DynamicParameter represents a special parameter that is used to represent + // the runtime size of a dimension of another parameter. A dynamic parameter + // has to be a scalar value. + struct DynamicParameter { + // The parameter number of dynamic parameter. + int64 parameter_num; + // The index of the parameter. + ShapeIndex parameter_index; + }; + + // DynamicDimension represents a dimension whose size is determined at + // runtime. A DynamicDimension's runtime size is determined by the binded + // DynamicParameter using `DynamicParameterBinding::Bind` method. + struct DynamicDimension { + // The parameter number of dynamic dimension. + int64 parameter_num; + // The subshape index of the parameter. + ShapeIndex parameter_index; + // The dimension number in the subshape. + int64 dimension; + + // "friend" keyword are added so these functions can be found by ADL. + template + friend H AbslHashValue(H h, const DynamicDimension& m) { + return H::combine(std::move(h), m.parameter_num, m.parameter_index, + m.dimension); + } + + friend bool operator==(const DynamicDimension& lhs, + const DynamicDimension& rhs) { + return lhs.parameter_num == rhs.parameter_num && + lhs.parameter_index == rhs.parameter_index && + lhs.dimension == rhs.dimension; + } + }; + + DynamicParameterBinding() = default; + + virtual ~DynamicParameterBinding() = default; + + // Adds binding which indicates that the dimension indicated by + // `dynamic_dimension` is dynamic, and its runtime size is represented by + // `dynamic_parameter`. + Status Bind(const DynamicParameter& dynamic_parameter, + const DynamicDimension& dynamic_dimension); + + // Returns the parameter and the index representing the runtime size of + // dimension `dim_num` of parameter `param_num` at `param_index`. + // + // Returns nullopt if the binding is not set. + absl::optional GetBinding( + const DynamicDimension& dynamic_dimension); + + using BindingFn = + std::function; + + // Iterate through each binding. + Status ForEachBinding(BindingFn fn) const; + + DynamicParameterBindingProto ToProto() const; + + static StatusOr CreateFromProto( + const DynamicParameterBindingProto& proto); + + string ToString() const; + + // Verifies that the given binding is valid for the given module. + // Specifically, the binding's parameter and parameter size should be valid. + Status Verify(const HloModule& module) const; + + private: + // Keeps track of mappings from DynamicDimension to DynamicParameter. The + // direction of is chosen so that we can easily query if a dimension is + // dynamic and which dynamic parameter represents the real size of that + // dimension. + absl::flat_hash_map bindings_; +}; + +std::ostream& operator<<(std::ostream& out, + const DynamicParameterBinding& binding); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_ diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..83a6d83dffde7995bd8e43917d13c5fd2705ba6f --- /dev/null +++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc @@ -0,0 +1,153 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" + +#include +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { +class DynamicParameterBindingTest : public HloTestBase {}; + +TEST_F(DynamicParameterBindingTest, SimpleBinding) { + // 'b' is a dynamic shape; 'a' represents the real size of b's first + // dimension. + const string module_str = R"( +HloModule TEST + +ENTRY main { + a = f32[] parameter(0) + b = f32[10] parameter(1) + ROOT root = (f32[], f32[10]) tuple(%a, %b) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + + DynamicParameterBinding binding; + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}}, + DynamicParameterBinding::DynamicDimension{1, {}, 0})); + + absl::optional param = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1, + /*parameter_index=*/{}, + /*dimension=*/0}); + EXPECT_TRUE(param); + EXPECT_EQ(param->parameter_num, 0); + EXPECT_EQ(param->parameter_index, ShapeIndex({})); + TF_EXPECT_OK(binding.Verify(*module)); +} + +TEST_F(DynamicParameterBindingTest, TupleBinding) { + // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's first + // dimension. + const string module_str = R"( +HloModule TEST + +ENTRY main { + param = (f32[], f32[10]) parameter(0) + gte1 = f32[] get-tuple-element(%param), index=0 + gte2 = f32[10] get-tuple-element(%param), index=1 + ROOT root = (f32[], f32[10]) tuple(%gte1, %gte2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + + DynamicParameterBinding binding; + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}}, + DynamicParameterBinding::DynamicDimension{0, {1}, 0})); + + absl::optional param = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0, + /*parameter_index=*/{1}, + /*dimension=*/0}); + + EXPECT_TRUE(param); + EXPECT_EQ(param->parameter_num, 0); + EXPECT_EQ(param->parameter_index, ShapeIndex({0})); + TF_EXPECT_OK(binding.Verify(*module)); +} + +TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) { + // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's both + // dimensions. + const string module_str = R"( +HloModule TEST + +ENTRY main { + param = (f32[], f32[10, 10]) parameter(0) + gte1 = f32[] get-tuple-element(%param), index=0 + gte2 = f32[10, 10] get-tuple-element(%param), index=1 + ROOT root = (f32[], f32[10, 10]) tuple(%gte1, %gte2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(module_str)); + + DynamicParameterBinding binding; + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}}, + DynamicParameterBinding::DynamicDimension{0, {1}, 0})); + + TF_EXPECT_OK( + binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}}, + DynamicParameterBinding::DynamicDimension{0, {1}, 1})); + + absl::optional param = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0, + /*parameter_index=*/{1}, + /*dimension=*/0}); + + EXPECT_TRUE(param); + EXPECT_EQ(param->parameter_num, 0); + EXPECT_EQ(param->parameter_index, ShapeIndex({0})); + + absl::optional param2 = + binding.GetBinding( + DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0, + /*parameter_index=*/{1}, + /*dimension=*/0}); + EXPECT_TRUE(param2); + EXPECT_EQ(param2->parameter_num, 0); + EXPECT_EQ(param2->parameter_index, ShapeIndex({0})); + + TF_EXPECT_OK(binding.Verify(*module)); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index f98c943669be8c14d245896b91cee3eee1e47429..00bb430206afdb81f9d101c0a5b2b4cf907b447a 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -22,6 +22,7 @@ limitations under the License. // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" #include "absl/strings/str_cat.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Instructions.h" @@ -1671,26 +1672,66 @@ StatusOr ElementalIrEmitter::EmitElementalConcatenate( b_->SetInsertPoint(init_block); + // Assign a unique id for each *different* operand, and count how often each + // operand is used. If all operands are different, the usage count will be 1 + // for each operand. + absl::flat_hash_map to_unique_operand_id; + std::vector operand_usage_count; + for (const auto* operand : hlo->operands()) { + if (to_unique_operand_id.contains(operand)) { + ++operand_usage_count[to_unique_operand_id[operand]]; + } else { + int64 unique_operand_id = to_unique_operand_id.size(); + to_unique_operand_id[operand] = unique_operand_id; + operand_usage_count.push_back(1); + } + } + + // To avoid that we emit the same operand more than once, we create one basic + // block for each *different* operand with a PHI node for the different source + // index inputs. + std::vector emit_operand_blocks( + to_unique_operand_id.size(), nullptr); + std::vector source_index_phis(to_unique_operand_id.size(), + nullptr); + for (const auto* operand : hlo->operands()) { + int64 operand_id = to_unique_operand_id[operand]; + if (emit_operand_blocks[operand_id] != nullptr) { + continue; + } + + emit_operand_blocks[operand_id] = llvm_ir::CreateBasicBlock( + exit_block, StrCat("concat_index_from_operand_id", operand_id), b_); + auto saved_insert_point = b_->GetInsertPoint(); + llvm_ir::SetToFirstInsertPoint(emit_operand_blocks[operand_id], b_); + source_index_phis[operand_id] = + PHI(source_index.GetType(), operand_usage_count[operand_id]); + auto operand_index = source_index; + operand_index[concat_dim] = source_index_phis[operand_id]; + + // Create the terminator of the block before calling operand generators, + // because they require non-degenerate basic blocks. + b_->SetInsertPoint(llvm::BranchInst::Create( + exit_block, /*InsertAtEnd=*/emit_operand_blocks[operand_id])); + TF_ASSIGN_OR_RETURN(llvm::Value * value, + operand_to_generator.at(operand)(operand_index)); + output->addIncoming(value, b_->GetInsertBlock()); + b_->SetInsertPoint(init_block, saved_insert_point); + } + for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); ++operand_idx) { const HloInstruction* operand = hlo->operand(operand_idx); - auto true_block = llvm_ir::CreateBasicBlock( - exit_block, StrCat("concat_index_from_operand", operand_idx), b_); auto false_block = llvm_ir::CreateBasicBlock( exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_); auto concat_dim_size = llvm::ConstantInt::get(source_index[concat_dim]->getType(), operand->shape().dimensions(concat_dim)); - CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block, - false_block); - - // Create the terminator of the true block before calling operand - // generators, because they require non-degenerate basic blocks. - b_->SetInsertPoint( - llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block)); - TF_ASSIGN_OR_RETURN(llvm::Value * value, - operand_to_generator.at(operand)(source_index)); - output->addIncoming(value, b_->GetInsertBlock()); + int64 operand_id = to_unique_operand_id[operand]; + source_index_phis[operand_id]->addIncoming(source_index[concat_dim], + b_->GetInsertBlock()); + CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), + emit_operand_blocks[operand_id], false_block); // Subtract the size of the concat dimension of the current operand // from the source index. diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 45f620f3f33eee41eefa9ddfdfb166a5ba76caef..b34bca55a48b113c325dbf28c03f7a0f5b71f658 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -61,7 +61,7 @@ struct ExecutionOutput { class Executable { public: explicit Executable( - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : hlo_module_(std::move(hlo_module)), @@ -162,7 +162,7 @@ class Executable { return hlo_profile_printer_data_ != nullptr; } - const HloModule& module() const { return *hlo_module_; } + HloModule& module() const { return *hlo_module_; } const bool has_module() const { return hlo_module_ != nullptr; } @@ -199,7 +199,7 @@ class Executable { // HloModule this was compiled from. BufferAssignment keeps pointers to // HloInstructions owned by the HloModule so we need to keep the HloModule // around. - const std::unique_ptr hlo_module_; + const std::unique_ptr hlo_module_; // HloSnapshot this was compiled from. Null if not dumping executions. std::unique_ptr hlo_snapshot_; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index b1629616acd2bb715d5aa1a89286a38a45417d2c..bfd1b6cb1492f5cb709e2ecefe73782094e26f5e 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -701,6 +701,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_cse", "//tensorflow/compiler/xla/service:hlo_dce", "//tensorflow/compiler/xla/service:hlo_element_type_converter", + "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_pass_pipeline", "//tensorflow/compiler/xla/service:hlo_proto", diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index 30c1f9088968305ad0207164ecb07ba13cc89ee6..470457935acacb8940af241dadb393d770786939 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -229,7 +229,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) { return user->opcode() == HloOpcode::kFusion && (user->fusion_kind() == HloInstruction::FusionKind::kLoop || - (user->fusion_kind() == HloInstruction::FusionKind::kInput && + (IsReduceInputFusion(*user) && LayoutsAreReduceInputFusionFriendly(*fusion, *user))); })) { VLOG(3) << "Not merging " << fusion->name() diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 57426327822d95a42f407ed7488f35acfd3623d2..ae2e718db29803a085401969a7d9b09abf690a6c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -51,7 +51,7 @@ GpuExecutable::GpuExecutable( const string& ptx, const std::vector& cubin, std::pair compute_capability, std::unique_ptr thunk_schedule, - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 0e276282e40fba0ae4881a51dad0c7c9e8d1c081..2b3c77f5b82aa94f44d8de56caf0f4d31c05e0cb 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -54,7 +54,7 @@ class GpuExecutable : public Executable { GpuExecutable(const string& ptx, const std::vector& cubin, std::pair compute_capability, std::unique_ptr thunk_schedule, - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 2d31fd5570c468b0c42fa308535fd335f3588a79..392b149abdfb5bf2ce76e8f9f7c4f2cba898ac8c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -55,7 +55,7 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, }); } -bool IsInputFusibleReduction(const HloInstruction& instr) { +bool IsReduceInputFusion(const HloInstruction& instr) { if (instr.IsMultiOutputFusion()) { for (const HloInstruction* operand : instr.fused_expression_root()->operands()) { @@ -67,17 +67,18 @@ bool IsInputFusibleReduction(const HloInstruction& instr) { return true; } } - return false; - } else if (instr.opcode() == HloOpcode::kFusion) { - if (IsReductionToVector(*instr.fused_expression_root())) { - CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput) - << " Fusion rooted at reduction-to-vector op must be of kind kInput: " - << instr.ToString(); - return true; - } - return false; + } else if (instr.opcode() == HloOpcode::kFusion && + IsReductionToVector(*instr.fused_expression_root())) { + CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput) + << " Fusion rooted at reduction-to-vector op must be of kind kInput: " + << instr.ToString(); + return true; } - return IsReductionToVector(instr); + return false; +} + +bool IsInputFusibleReduction(const HloInstruction& instr) { + return IsReduceInputFusion(instr) || IsReductionToVector(instr); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h index f7c24a0d5bbfcc61389ea19ae7f769671e4e974d..c0be354730d22fb76754a60a1c9c58781d0d452a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -33,14 +33,17 @@ namespace gpu { bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer, const HloInstruction& reduce); -// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr` -// is either an unfused reduction-to-vector op, an input fusion rooted at a -// reduction-to-vector op, or a multi-output input fusion with at least one -// reduction-to-vector op root. // Note that reduction ops are lowered in different ways. Reduce input fusions // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at // reduction-to-vector ops. Other reduction ops are lowered by // GpuElementalIrEmitter and fused like elementwise ops. + +// Whether `instr` is an input fusion rooted at a reduction-to-vector op or a +// multi-output input fusion with at least one reduction-to-vector op root. +bool IsReduceInputFusion(const HloInstruction& instr); + +// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr` +// is either an unfused reduction-to-vector op or a reduce input fusion. bool IsInputFusibleReduction(const HloInstruction& instr); } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc index d91b7bc61fda5a07c163a07ec0e1644d2ad9db49..12222500ea732a4ca8ea6b3a37033f7e8d4ee927 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc @@ -178,7 +178,7 @@ TEST_F(GpuFusibleTest, EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( ENTRY entry { c0 = f32[] parameter(0) @@ -191,10 +191,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_ElementalReduction) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( ENTRY entry { c0 = f32[] parameter(0) @@ -207,10 +208,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputInputReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -225,10 +227,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputLoopReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -243,10 +246,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputInputReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -263,11 +267,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } TEST_F(GpuFusibleTest, - IsInputFusibleReduction_MultiOutputInputReduceFusionWithExtraOutputs) { + IsReduceInputFusion_MultiOutputInputReduceFusionWithExtraOutputs) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -284,10 +289,11 @@ TEST_F(GpuFusibleTest, const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_TRUE(IsReduceInputFusion(*reduce)); EXPECT_TRUE(IsInputFusibleReduction(*reduce)); } -TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) { +TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputLoopReduceFusion) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -304,11 +310,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) { const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } TEST_F(GpuFusibleTest, - IsInputFusibleReduction_MultiOutputLoopFusionReduceAndElementwiseOp) { + IsReduceInputFusion_MultiOutputLoopFusionReduceAndElementwiseOp) { auto module = ParseHloString(absl::StrCat(kModulePrefix, R"( fused_reduction { c0 = f32[] parameter(0) @@ -325,6 +332,7 @@ TEST_F(GpuFusibleTest, const HloInstruction* reduce = module->entry_computation()->root_instruction(); ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion); + EXPECT_FALSE(IsReduceInputFusion(*reduce)); EXPECT_FALSE(IsInputFusibleReduction(*reduce)); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc index 91609c730b6c0d666eb607fb42b918c0f8f250e5..1126943624a3771433ecac591545d335c1890115 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc @@ -37,7 +37,7 @@ class GpuHloOrdering : public PredecessorHloOrdering { public: GpuHloOrdering(const HloModule* module, const StreamAssignment& stream_assignment, - const std::vector& thunk_launch_order); + const std::vector& thunk_launch_order); ~GpuHloOrdering() override = default; // Only the entry computation can possibly be sequentially ordered, and only @@ -56,7 +56,7 @@ class GpuHloOrdering : public PredecessorHloOrdering { GpuHloOrdering::GpuHloOrdering( const HloModule* module, const StreamAssignment& stream_assignment, - const std::vector& thunk_launch_order) + const std::vector& thunk_launch_order) : PredecessorHloOrdering(module) { // The entry computation has a total order when there's only one stream. if (stream_assignment.StreamCount() == 1) { @@ -150,7 +150,7 @@ GpuHloOrdering::GpuHloOrdering( // However, if the total order is A,B,D,C,E, then C and E can run // concurrently. void BFSLaunchOrder(const HloComputation* computation, - std::vector* launch_order) { + std::vector* launch_order) { // This topological sort uses two data structures: // 1. `incoming_edge_count` which keeps track of the number of incoming // edges to each HLO; @@ -158,9 +158,9 @@ void BFSLaunchOrder(const HloComputation* computation, // // The sorting algorithm repeatedly pops the top from the queue and deletes // that HLO from the graph, making more HLOs incoming-edge free. - std::deque queue; + std::deque queue; std::unordered_map incoming_edge_count; - for (const auto& hlo : computation->instructions()) { + for (auto* hlo : computation->instructions()) { if (hlo->operand_count() == 0) { queue.push_back(hlo); } else { @@ -172,10 +172,10 @@ void BFSLaunchOrder(const HloComputation* computation, } while (!queue.empty()) { - const HloInstruction* x = queue.front(); + HloInstruction* x = queue.front(); queue.pop_front(); launch_order->push_back(x); - for (const HloInstruction* y : x->users()) { + for (HloInstruction* y : x->users()) { --incoming_edge_count[y]; if (incoming_edge_count[y] == 0) { queue.push_back(y); @@ -195,14 +195,14 @@ StatusOr> GpuHloSchedule::Build( std::unique_ptr schedule(new GpuHloSchedule); // Initialize thunk_launch_order_, the total order of thunk launches. - const HloComputation* entry_computation = module.entry_computation(); + HloComputation* entry_computation = module.entry_computation(); if (stream_assignment.StreamCount() == 1) { // All kernels are launched on a single stream, so there's no loss of // concurrency by optimizing for minimal memory usage. TF_ASSIGN_OR_RETURN( HloInstructionSequence sequence, ScheduleComputation( - *entry_computation, [pointer_size](const BufferValue& buffer) { + entry_computation, [pointer_size](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); })); schedule->thunk_launch_order_ = sequence.instructions(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h index 07a7fc67aa555845c3de57e574ab582403ec0490..7f224ffe4f03f8f05b0f1907628d99d9df387770 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h @@ -46,7 +46,7 @@ class GpuHloSchedule { // Returns the total order of thunk launches, represented in terms of HLO // instructions. - const std::vector& ThunkLaunchOrder() const { + const std::vector& ThunkLaunchOrder() const { return thunk_launch_order_; } @@ -60,7 +60,7 @@ class GpuHloSchedule { private: GpuHloSchedule(); - std::vector thunk_launch_order_; + std::vector thunk_launch_order_; std::unique_ptr hlo_ordering_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc index 5f857a1a5452a20e2cde50a54d2416f79048edb1..91db7151f22fd75b20244878bee86d65acd1d304 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc @@ -33,7 +33,7 @@ namespace gpu { class GpuHloScheduleTest : public HloTestBase { protected: - using HloVec = std::vector; + using HloVec = std::vector; // Pre-canned shapes. Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 7fcdd805ed32004a96ecc0da7de1d89bcf1b6229..31591914cc553f0f5ecd81cb514faa1dc56ea041 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -97,6 +97,18 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) { return Status::OK(); } +Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) { + VLOG(2) << "HandleAddDependency: " << add_dependency->ToString(); + const HloInstruction* operand = add_dependency->operand(0); + // Add_Dependency is a no-op, but we still want to bind it to an llvm::Value + // sometimes, e.g., when it's operand is a constant or a bitcast of a + // constant. + if (bindings_.BoundToIrValue(*operand)) { + bindings_.BindHloToIrValue(*add_dependency, GetBasePointer(*operand)); + } + return Status::OK(); +} + Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) { auto operand = get_tuple_element->operand(0); CHECK(bindings_.BoundToIrValue(*operand)); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 56c3f452006f9e2d5c37cc3b54701b2367abfa14..2da46c016935d0e927879bbfb0d05cfc4899d818 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -100,6 +100,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleBatchNormInference(HloInstruction* batch_norm) override; Status HandleBatchNormTraining(HloInstruction* batch_norm) override; Status HandleBatchNormGrad(HloInstruction* batch_norm) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; Status FinishVisit(HloInstruction* root) override { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 5751cfd9521430b09dee9e440f0e28e6cdc6ea17..52f0ba7aa7bf5d7000f955a5b318fd0fa00d592a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -65,6 +65,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h" @@ -2172,7 +2173,18 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) { Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { std::vector> thunks; Shape keys_shape = sort->operand(0)->shape(); + int64 dimension_to_sort = sort->dimensions(0); + // In case there is a 'values' parameter that is a iota, we take note and use + // it later to ensure a stable sort. Otherwise, we don't guarantee a stable + // sort. + int64 iota_values_parameter_index = -1; for (int64 i = 0; i < sort->operand_count(); ++i) { + if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota && + ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) && + Cast(sort->operand(i))->iota_dimension() == + dimension_to_sort) { + iota_values_parameter_index = i; + } ShapeIndex shape_index = sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({}); // We assume that the layout of all involved operands and outputs is the @@ -2197,7 +2209,6 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { } } - int64 dimension_to_sort = sort->dimensions(0); uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); @@ -2299,8 +2310,9 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { } } return llvm_ir::EmitSortInPlace( - dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_masks, - &b_, launch_dimensions, + dimension_to_sort, keys_array, values_arrays, + iota_values_parameter_index, IrName(sort), xor_masks, &b_, + launch_dimensions, xor_masks.size() > 1 ? num_iterations_in_sort_dim : standard_num_iterations_in_sort_dim, kTileSize); @@ -2386,7 +2398,7 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) { return Status::OK(); } -Status IrEmitterUnnested::HandleAfterAll(HloInstruction* gen_token) { +Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) { return Status::OK(); } @@ -3677,7 +3689,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( constexpr int kNumRows = 4; KernelMappingScheme mapping_scheme( reduced_output_dims, /*tile_size_y=*/kWarpSize, - /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{2, 2, 2}, + /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, &b_); TileElementGenerator element_generator; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 97a1e10455336cd4842275b6cf1482614bfbfa60..e09ed657a812be6ab4859a0e365a51c45a37bfed 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -171,7 +171,7 @@ class IrEmitterUnnested : public IrEmitter { Status HandleSort(HloInstruction* sort) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleCrossReplicaSum(HloInstruction* crs) override; - Status HandleAfterAll(HloInstruction* gen_token) override; + Status HandleAfterAll(HloInstruction* after_all) override; Status EmitTargetElementLoop( const HloInstruction& hlo, diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 48a860ba4132dda9c1bade86ee086916b7b7aec3..637b861f70235f17e8e739907a3f262b7004ee7c 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -67,6 +67,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h" +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" @@ -142,6 +143,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, Compiler* compiler) { { HloPassPipeline pipeline("optimization"); + pipeline.AddPass(); pipeline.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); pipeline.AddPass(); @@ -177,9 +179,10 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, // elimination has to come after that pass. pipeline.AddPass(); - pass.AddPass( - /*is_layout_sensitive=*/false, + AlgebraicSimplifierOptions options( [](const Shape&, const Shape&) { return false; }); + options.set_enable_permutation_sort_replacement(true); + pass.AddPass(options); pass.AddPass(); pass.AddPass(); pass.AddPass(); @@ -248,11 +251,13 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. - pipeline.AddPass>( - /*is_layout_sensitive=*/true, + AlgebraicSimplifierOptions options( /*valid_bitcast_callback=*/[](const Shape&, const Shape&) { return true; }); + options.set_is_layout_sensitive(true); + options.set_enable_permutation_sort_replacement(true); + pipeline.AddPass>(options); // Choose the fastest algorithm for each conv. // diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc index 141f3219387940a08ef22cbcc0be0971a14c2cd6..6b2d76764a077dc6cfa3f9ddc6e525ab330323be 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc @@ -45,7 +45,7 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands( ThunkSchedule::ThunkSchedule( std::unique_ptr thunks, std::unique_ptr stream_assignment, - const std::vector& hlo_total_order) + const std::vector& hlo_total_order) : thunks_(std::move(thunks)), stream_assignment_(std::move(stream_assignment)) { std::unordered_map hlo_to_thunk; @@ -53,7 +53,7 @@ ThunkSchedule::ThunkSchedule( InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get()); } - for (const HloInstruction* hlo : hlo_total_order) { + for (HloInstruction* hlo : hlo_total_order) { if (hlo_to_thunk.count(hlo)) { thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo)); } diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h index d3352994f845a535233612a17e19107511ce0622..43b628a1baf0e79a3197f3cfad3547991642eaed 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h +++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h @@ -46,7 +46,7 @@ class ThunkSchedule { public: ThunkSchedule(std::unique_ptr thunks, std::unique_ptr stream_assignment, - const std::vector& hlo_total_order); + const std::vector& hlo_total_order); // Returns the total order of executing all the thunks. const std::vector& TotalOrder() const { return thunk_total_order_; } diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index fad3215fc81e1012ddaa5a6458bc98f90ab97f18..dc40b9446ad1bffcb757543e52fc9ab20de6d52e 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -258,7 +258,7 @@ class HeapSimulatorTracker { // Constructor for testing a single entry computation. HeapSimulatorTracker( const string& name, std::unique_ptr computation, - const std::vector& instruction_sequence) { + const std::vector& instruction_sequence) { HloModuleConfig config; module_ = absl::make_unique(name, config); module_->AddEntryComputation(std::move(computation)); @@ -286,7 +286,7 @@ class HeapSimulatorTracker { // Similar to the single entry computation constructor above, but runs the // simulation over the entire module. void RunWholeModule( - const std::vector& full_module_sequence) { + const std::vector& full_module_sequence) { points_to_analysis_ = TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); @@ -294,7 +294,7 @@ class HeapSimulatorTracker { HloSchedule schedule(module_.get()); absl::flat_hash_map reverse_position; for (int i = 0; i < full_module_sequence.size(); ++i) { - const HloInstruction* instruction = full_module_sequence[i]; + HloInstruction* instruction = full_module_sequence[i]; schedule.GetOrCreateSequence(instruction->parent()) .push_back(instruction); reverse_position[instruction] = full_module_sequence.size() - i; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index dbab62f847e8ca5e0b46dfd4162a0f4222640252..c62c935af789bc5274eea835d16a3392ef393cf9 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -205,7 +205,8 @@ message HloComputationProto { repeated HloInstructionProto instructions = 2; // The program shape (with layout) of this computation. - xla.ProgramShape program_shape = 4; + + xla.ProgramShapeProto program_shape = 4; // The id of this computation. int64 id = 5; @@ -251,6 +252,41 @@ message HloInputOutputAliasProto { repeated AliasEntryProto entries = 1; } +message DynamicParameterBindingProto { + // A list of bindings which indicates that the `target_dim_num` in + // the subshape `target_param_index` of parameter `target_param_num` + // is a dynamic dimension and its real dynamic size is represented + // by `dynamic_param_index` in parameter `dynamic_param_num`. + // + // As an example, imagine we have a program: + // + // ENTRY main { + // a = f32[] parameter(0) + // b = f32[10] parameter(1) + // ROOT root = (f32[], f32[10]) tuple(%a, %b) + // } + // + // Let's say 'b' (param index 1) is a dynamic shape whose input has + // an upperbound of 10 and real size is determined at runtime.'a' + // represents the real size of b's first dimension. + // + // In this case, the fields are set in the following way: + // dynamic_param_num = 1 + // dynamic_param_index = {} + // target_param_num = 0 + // target_param_index = {} + // target_param_dim = 0 + message Binding { + int64 dynamic_param_num = 1; + repeated int64 dynamic_param_index = 2; + int64 target_param_num = 3; + repeated int64 target_param_index = 4; + int64 target_param_dim_num = 5; + } + + repeated Binding entries = 1; +} + // Serialization of HloModule. message HloModuleProto { string name = 1; @@ -262,7 +298,7 @@ message HloModuleProto { repeated HloComputationProto computations = 3; // The host program shape (with layout) of the entry computation. - xla.ProgramShape host_program_shape = 4; + xla.ProgramShapeProto host_program_shape = 4; // The id of this module. int64 id = 5; @@ -272,6 +308,8 @@ message HloModuleProto { // Describes alias information between inputs and outputs. HloInputOutputAliasProto input_output_alias = 8; + + DynamicParameterBindingProto dynamic_parameter_binding = 9; } // Serialization of LogicalBuffer. diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 0c20d207ddbca82e2f87800d331d1bace39a512e..d06c2207cb764322c035b4b06e6a80cc515d374d 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -499,7 +499,7 @@ HloComputationProto HloComputation::ToProto() const { proto.add_instructions()->Swap(&instruction_proto); } proto.set_root_id(root_instruction()->unique_id()); - *proto.mutable_program_shape() = ComputeProgramShape(); + *proto.mutable_program_shape() = ComputeProgramShape().ToProto(); return proto; } @@ -795,7 +795,7 @@ Status HloComputation::AcceptWithOperandOrder( template Status HloComputation::AcceptOrdered( DfsHloVisitorBase* visitor, - const std::vector& order) const { + const std::vector& order) const { VLOG(3) << "Accepting visitor with order."; for (HloInstruction* root : CollectUnreachableRoots()) { TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end()) @@ -825,9 +825,9 @@ Status HloComputation::AcceptOrdered( // Explicit instantiations. template Status HloComputation::AcceptOrdered( - DfsHloVisitor*, const std::vector&) const; + DfsHloVisitor*, const std::vector&) const; template Status HloComputation::AcceptOrdered( - ConstDfsHloVisitor*, const std::vector&) const; + ConstDfsHloVisitor*, const std::vector&) const; Status HloComputation::Accept( const std::function& visitor_func) { diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index fc7d2035e5bd0b99fa9e7a026430172f686019d4..be1ce336968504b6406c9ef4b879821821c5b187 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -301,7 +301,7 @@ class HloComputation { // be a topological sort of all instructions in the computation. template Status AcceptOrdered(DfsHloVisitorBase* visitor, - const std::vector& order) const; + const std::vector& order) const; // Same as Accept() above, but the visitor is given as a function. Status Accept(const std::function& visitor_func); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index fdfb38b858c32ba5b092ec2db84d4bac487c3e78..df7d3826dbad1f264a5dc53312c062900155b0f6 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -419,6 +419,21 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) { } Status HloCostAnalysis::HandleAfterAll(const HloInstruction*) { + // This instruction is used to enforce ordering at compile time. No code is + // emitted. + current_should_compute_bottleneck_time_ = false; + current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; + return Status::OK(); +} + +Status HloCostAnalysis::HandleAddDependency( + const HloInstruction* add_dependency) { + // This instruction is used to enforce ordering at compile time. No code is + // emitted. + current_should_compute_bottleneck_time_ = false; + current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 8ced9d776e150ac587e9ac3ed0beffbc38dc5503..33983119c9b00a248c0e8dcc5815c6367192dca3 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -101,6 +101,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleBroadcast(const HloInstruction* broadcast) override; Status HandlePad(const HloInstruction* pad) override; Status HandleReshape(const HloInstruction* reshape) override; + Status HandleAddDependency(const HloInstruction* add_dependency) override; Status HandleAfterAll(const HloInstruction* token) override; Status HandleTranspose(const HloInstruction* transpose) override; Status HandleWhile(const HloInstruction* xla_while) override; diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 5dcf6bc985ff18fa6fc1ab5a5692914b4597d065..3ed3d3c11c71dc534f193ba3ffb556b0eb0c80e4 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -466,6 +466,21 @@ bool HloDataflowAnalysis::UpdateDomainValueSet(HloInstruction* domain) { return changed; } +bool HloDataflowAnalysis::UpdateAddDependencyValueSet( + HloInstruction* add_dependency) { + // AddDependency just forwards the value of its zero-th operand. + CHECK_EQ(add_dependency->opcode(), HloOpcode::kAddDependency); + const InstructionValueSet& operand_set = + GetInstructionValueSet(add_dependency->operand(0)); + InstructionValueSet& add_dependency_set = + GetInstructionValueSet(add_dependency); + if (operand_set != add_dependency_set) { + add_dependency_set = operand_set; + return true; + } + return false; +} + bool HloDataflowAnalysis::UpdateGetTupleElementValueSet(HloInstruction* gte) { CHECK_EQ(gte->opcode(), HloOpcode::kGetTupleElement); bool changed = false; @@ -622,6 +637,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( HloInstruction* instruction) { // Recompute from operands. switch (instruction->opcode()) { + case HloOpcode::kAddDependency: + return UpdateAddDependencyValueSet(instruction); case HloOpcode::kBitcast: return UpdateBitcastValueSet(instruction); case HloOpcode::kDomain: @@ -795,6 +812,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { define_all_values(); } break; + case HloOpcode::kAddDependency: case HloOpcode::kWhile: case HloOpcode::kCall: case HloOpcode::kConditional: diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index abac398c04fc4c418d8814a0097db4434bc1cd9c..ece17fc4c3ea0261474df5d53c088dd05016e1e4 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -193,6 +193,7 @@ class HloDataflowAnalysis { bool UpdateSendValueSet(HloInstruction* send); bool UpdateTupleValueSet(HloInstruction* tuple); bool UpdateWhileValueSet(HloInstruction* xla_while); + bool UpdateAddDependencyValueSet(HloInstruction* add_dependency); // Propagate the dataflow through the module. void Propagate(); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index e8eb7066f96537ff7d5a932434852bc4cf209281..f7a1f19a6f52befd58a405d0e406d7d0d37a8e57 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -1877,6 +1877,30 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) { } } +TEST_P(HloDataflowAnalysisTest, AddDependency) { + string module_string = R"( +HloModule AddDependency +ENTRY %AddDependency (p: f32[3]) -> f32[3] { + %p = f32[3] parameter(0) + %token = token[] after-all() + ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr analysis, + HloDataflowAnalysis::Run(*module)); + const HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kAddDependency); + + // The after-all and parameter should define a value. Add-dependency should + // not. + EXPECT_EQ(analysis->values().size(), 2); + EXPECT_FALSE(analysis->ValueIsDefinedAt(root)); +} + INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, HloDataflowAnalysisTest, ::testing::Values(false, true)); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 7fcafafc097a623686ca98a7cb3c6256c7904f6d..51a3fba1768aaf219b78ddc09a1c526448389d9e 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -1046,8 +1046,15 @@ Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } -Status HloEvaluator::HandleAfterAll(HloInstruction* token) { - evaluated_[token] = LiteralUtil::CreateToken(); +Status HloEvaluator::HandleAfterAll(HloInstruction* after_all) { + evaluated_[after_all] = LiteralUtil::CreateToken(); + return Status::OK(); +} + +Status HloEvaluator::HandleAddDependency(HloInstruction* add_dependency) { + // AddDedendency just forwards its zero-th operand. + evaluated_[add_dependency] = + GetEvaluatedLiteralFor(add_dependency->operand(0)).Clone(); return Status::OK(); } @@ -1279,10 +1286,10 @@ StatusOr EvaluateSortInternal(HloInstruction* sort, key_value_vector.push_back( std::make_pair(keys_data[i], values_data[i])); } - std::sort(key_value_vector.begin(), key_value_vector.end(), - [](const kv_pair& a, const kv_pair& b) { - return SafeLess(a.first, b.first); - }); + std::stable_sort(key_value_vector.begin(), key_value_vector.end(), + [](const kv_pair& a, const kv_pair& b) { + return SafeLess(a.first, b.first); + }); std::vector result_keys; // We use a InlinedVector here because we need to convert it to an // absl::Span later, and this would not work with std::vector. diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index d751f40fff872b831338dc8aa08a04cb00d2838c..d847900010c697d7d280ed8e4a9502f1c465ee07 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -180,7 +180,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault { Status HandleBroadcast(HloInstruction* broadcast) override; - Status HandleAfterAll(HloInstruction* token) override; + Status HandleAfterAll(HloInstruction* after_all) override; + + Status HandleAddDependency(HloInstruction* add_dependency) override; Status HandleSort(HloInstruction* sort) override; diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index b79412e4d9a1eff6cd693352ba21d7caf7397530..b87fc3e34012e75ee07bff6c1e113dce404f83cb 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -593,7 +593,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } - Status HandleDivide(HloInstruction* divide) { + Status HandleDivide(HloInstruction* divide) override { return HandleDivide(divide); } @@ -1553,10 +1553,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const auto& row_data = row_to_sort.data(); std::vector result_data(row_data.begin(), row_data.end()); - std::sort(result_data.begin(), result_data.end(), - [](const NativeT& a, const NativeT& b) { - return SafeLess(a, b); - }); + std::stable_sort(result_data.begin(), result_data.end(), + [](const NativeT& a, const NativeT& b) { + return SafeLess(a, b); + }); Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(), {sort_dim_elements})); sorted_row.PopulateR1(absl::Span(result_data)); @@ -2543,12 +2543,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { template ::value || - std::is_same::value || - std::is_same::value>::type* = nullptr> + std::is_integral::value || + std::is_floating_point::value>::type* = nullptr> Status HandleIota(HloInstruction* instruction) { auto* iota = Cast(instruction); - std::vector data(iota->shape().dimensions(iota->iota_dimension())); + // Avoid using std::vector since std::vector does not convert to + // absl::Span. + absl::InlinedVector data( + iota->shape().dimensions(iota->iota_dimension())); std::iota(data.begin(), data.end(), 0); auto result = LiteralUtil::CreateR1(data); @@ -2565,9 +2567,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { } template ::value || - std::is_same::value || - std::is_same::value)>::type* = nullptr> + !(std::is_integral::value || + std::is_floating_point::value)>::type* = nullptr> Status HandleIota(HloInstruction* iota) { return InvalidArgument("Unsupported type for iota"); } diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc new file mode 100644 index 0000000000000000000000000000000000000000..631b3ad735f369922d10b37d11e2a1b1ba117e6b --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc @@ -0,0 +1,66 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" + +namespace xla { + +namespace { + +StatusOr ReplaceGetSize(HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kGetDimensionSize) { + return false; + } + HloComputation* computation = instr->parent(); + + TF_ASSIGN_OR_RETURN(auto legal_shape, + ShapeInference::InferGetDimensionSizeShape( + instr->operand(0)->shape(), instr->dimension())); + TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape)); + TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32)); + uint32 size = instr->operand(0)->shape().dimensions(instr->dimension()); + HloInstruction* new_instr = computation->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(size))); + TF_RETURN_IF_ERROR(computation->ReplaceInstruction(instr, new_instr)); + return true; +} + +} // namespace + +StatusOr HloGetDimensionSizeRewriter::Run(HloModule* module) { + bool changed = false; + HloProto proto; + *proto.mutable_hlo_module() = module->ToProto(); + for (auto* computation : module->computations()) { + // Replacing instructions will change the instruction list in the + // computation. So instead of iterating computation->instructions() + // directly, we make a copy of the list to avoid use-after-free. + std::vector instrs(computation->instruction_count()); + absl::c_copy(computation->instructions(), instrs.begin()); + for (auto instruction : instrs) { + TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction)); + changed = changed || replaced; + } + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h new file mode 100644 index 0000000000000000000000000000000000000000..30f44c23a835b3bcc935caaa917e040e07c4e703 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// Pass to replace a kGetDimensionSize instruction with a constant instruction. +class HloGetDimensionSizeRewriter : public HloModulePass { + public: + absl::string_view name() const override { + return "hlo-get-dimension-size-rewriter"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a86aebdd5b64240e6e07d8e8050c0c8681cce765 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc @@ -0,0 +1,83 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class HloGetDimensionSizeRewriterTest : public HloTestBase { + protected: + HloGetDimensionSizeRewriterTest() {} +}; + +TEST_F(HloGetDimensionSizeRewriterTest, Ok) { + auto module = ParseHloString(R"( +HloModule _ +ENTRY gds { + p = s32[3,4] parameter(0) + size0 = u32[] get-dimension-size(p), dimensions={0} + size1 = u32[] get-dimension-size(p), dimensions={1} + ROOT mul = u32[] multiply(size0, size1) +})") + .ValueOrDie(); + HloGetDimensionSizeRewriter pass; + EXPECT_TRUE(pass.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Multiply(op::Constant(), op::Constant())); +} + +TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) { + auto module = ParseHloString(R"( +HloModule _ +ENTRY gds { + p = s32[3]{0} parameter(0) + ROOT gds = s64[] get-dimension-size(p), dimensions={0} +})") + .ValueOrDie(); + HloGetDimensionSizeRewriter pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + +TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) { + auto module = ParseHloString(R"( +HloModule _ +ENTRY gds { + p = f32[2,5] parameter(0) + ROOT gds = u32[] get-dimension-size(p), dimensions={2} +})") + .ValueOrDie(); + HloGetDimensionSizeRewriter pass; + EXPECT_FALSE(pass.Run(module.get()).ok()); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 05cc1593e4ef4fc52b94e0536628645b1fa2abbc..7e9e94ca5f6f41ef4c5fd624a5b2f263aaf9ebdc 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -987,6 +987,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGetTupleElement: case HloOpcode::kTrace: case HloOpcode::kAfterAll: + case HloOpcode::kAddDependency: case HloOpcode::kTuple: return kWhite; case HloOpcode::kBroadcast: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index ce255292d2805e214cb65d4607cf6548ff768c40..1e3881c34f0701050f1c87e15719ec403ca119d8 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -855,6 +855,16 @@ HloInstruction::CreateCollectivePermute( new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape())); } +/* static */ std::unique_ptr +HloInstruction::CreateAddDependency(HloInstruction* data_operand, + HloInstruction* token_operand) { + auto instruction = absl::WrapUnique( + new HloInstruction(HloOpcode::kAddDependency, data_operand->shape())); + instruction->AppendOperand(data_operand); + instruction->AppendOperand(token_operand); + return instruction; +} + /* static */ std::unique_ptr HloInstruction::CreateWhile( const Shape& shape, HloComputation* condition, HloComputation* body, HloInstruction* init) { @@ -1394,6 +1404,10 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( clone = CreateAfterAll(new_operands); } break; + case HloOpcode::kAddDependency: + CHECK_EQ(new_operands.size(), 2); + clone = CreateAddDependency(new_operands[0], new_operands[1]); + break; } // SetupDerivedInstruction will setup the precision_config_ field. SetupDerivedInstruction(clone.get()); @@ -1680,6 +1694,7 @@ bool HloInstruction::IdenticalSlowPath( // This opcode has complex or special behavior so just return false. case HloOpcode::kAfterAll: + case HloOpcode::kAddDependency: return false; // Remaining instructions with special values. @@ -2467,6 +2482,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleDomain(this); case HloOpcode::kAfterAll: return visitor->HandleAfterAll(this); + case HloOpcode::kAddDependency: + return visitor->HandleAddDependency(this); case HloOpcode::kIota: return visitor->HandleIota(this); case HloOpcode::kGetDimensionSize: @@ -2628,36 +2645,6 @@ Status HloInstruction::AcceptWithOperandOrder( return Status::OK(); } -namespace { - -// Returns true if the given order is a topological sort of the instructions -// it contains. -bool OrderIsTopologicalSort(const std::vector& order) { - // Create a map from instruction to its position in 'order'. - std::unordered_map order_position; - for (int i = 0; i < order.size(); i++) { - if (!order_position.insert({order[i], i}).second) { - // Instruction order[i] is duplicated in the order. - return false; - } - } - // Verify that the operand of each instruction in the order is also in the - // order *and* the operand's position is earlier (defs are before uses for - // all ops). - for (auto* instruction : order) { - for (auto* operand : instruction->operands()) { - if (!ContainsKey(order_position, operand) || - order_position.at(operand) >= order_position.at(instruction)) { - return false; - } - } - } - - return true; -} - -} // namespace - Status HloInstruction::Accept( const std::function& visitor_func) { FunctionVisitor visitor(visitor_func); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 95ad29235afa36dc4091feec54cd4b0f5f24048f..87748a771a64dd9b26eec521796deb9619cd8879 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -770,6 +770,9 @@ class HloInstruction { static std::unique_ptr CreateGetDimensionSize( const Shape& shape, HloInstruction* operand, int64 dimension); + static std::unique_ptr CreateAddDependency( + HloInstruction* data_operand, HloInstruction* token_operand); + // Returns the opcode for this instruction. HloOpcode opcode() const { return opcode_; } diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc index 234fcd266aa09e193849ffb4526599114dfe22fe..d2740bcce26f04c5d7c8b64cfdaea53e3c697855 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc @@ -73,7 +73,7 @@ class ListScheduler { // Construct and return a memory-minimizing sequence of HLO instructions // containing the given HLO computation. static StatusOr Run( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -98,7 +98,7 @@ class ListScheduler { // comparison operators. using Priority = std::pair; - ListScheduler(const HloComputation& computation, + ListScheduler(HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -111,7 +111,7 @@ class ListScheduler { // instruction. An HLO instruction "uses" a LogicalBuffer if the // LogicalBuffer is in an operand of the instruction as indicated by // points-to analysis. - for (auto* instruction : computation.instructions()) { + for (auto* instruction : computation->instructions()) { absl::flat_hash_set instr_uses; for (auto* operand : instruction->operands()) { points_to_analysis.GetPointsToSet(operand).ForEachElement( @@ -126,13 +126,13 @@ class ListScheduler { // Create map containing the number of unscheduled uses (hlo instructions) // of each logical buffer. - for (auto* instruction : computation.instructions()) { + for (auto* instruction : computation->instructions()) { for (auto* buffer : points_to_analysis.GetBuffersDefinedByInstruction(instruction)) { unscheduled_use_count_[buffer] = 0; } } - for (auto* instruction : computation.instructions()) { + for (auto* instruction : computation->instructions()) { for (const LogicalBuffer* buffer : buffer_uses_.at(instruction)) { ++unscheduled_use_count_[buffer]; } @@ -141,7 +141,7 @@ class ListScheduler { // Buffers live out of the computation have an implicit use at the end of // the computation. for (const LogicalBuffer* live_out_buffer : - points_to_analysis.GetPointsToSet(computation.root_instruction()) + points_to_analysis.GetPointsToSet(computation->root_instruction()) .CreateFlattenedSet()) { ++unscheduled_use_count_[live_out_buffer]; } @@ -157,7 +157,7 @@ class ListScheduler { // HloInstruction, plus some cached metadata, saved for the purposes of making // BytesFreedIfScheduled fast. struct ReadyListEntry { - const HloInstruction* instruction; + HloInstruction* instruction; // The total size of all buffers defined by this instruction. int64 bytes_defined; @@ -171,7 +171,7 @@ class ListScheduler { }; // Creates a ReadyListEntry for the given instruction. - ReadyListEntry MakeReadyListEntry(const HloInstruction* instruction) { + ReadyListEntry MakeReadyListEntry(HloInstruction* instruction) { ReadyListEntry entry; entry.instruction = instruction; @@ -250,13 +250,13 @@ class ListScheduler { // Populate the ready list with instructions which have no operands or // control predecessors. absl::flat_hash_map unscheduled_pred_count; - for (auto* instruction : computation_.instructions()) { + for (auto* instruction : computation_->instructions()) { // TODO(b/34466113): Replace this and above with successors() or // predecessors() when these methods are added to HloInstruction. - for (const HloInstruction* user : instruction->users()) { + for (HloInstruction* user : instruction->users()) { unscheduled_pred_count[user]++; } - for (const HloInstruction* succ : instruction->control_successors()) { + for (HloInstruction* succ : instruction->control_successors()) { unscheduled_pred_count[succ]++; } } @@ -275,7 +275,7 @@ class ListScheduler { ready_instructions[inst] = it; }; - for (auto* instruction : computation_.instructions()) { + for (auto* instruction : computation_->instructions()) { if (instruction->operands().empty() && instruction->control_predecessors().empty()) { add_to_ready_queue(instruction); @@ -287,7 +287,7 @@ class ListScheduler { // schedule. auto best_it = ready_queue.end(); --best_it; - const HloInstruction* best = best_it->second.instruction; + HloInstruction* best = best_it->second.instruction; VLOG(2) << "Schedule instruction: " << best->ToShortString() << " Bytes freed: " << best_it->first.first; ready_queue.erase(best_it); @@ -348,13 +348,13 @@ class ListScheduler { } } } - CHECK_EQ(schedule.size(), computation_.instruction_count()); - CHECK_EQ(scheduled_instructions_.size(), computation_.instruction_count()); + CHECK_EQ(schedule.size(), computation_->instruction_count()); + CHECK_EQ(scheduled_instructions_.size(), computation_->instruction_count()); return schedule; } - const HloComputation& computation_; + HloComputation* computation_; const TuplePointsToAnalysis& points_to_analysis_; const LogicalBuffer::SizeFunction& size_function_; // Computations are analyzed in post-order. When scheduling an instruction @@ -386,13 +386,13 @@ int64 SumLogicalBufferSizes( } StatusOr ScheduleComputationHelper( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const MemorySchedulerAlgorithm& algorithm, const absl::flat_hash_map& memory_by_computation) { - VLOG(2) << "Computation: " << computation.name(); + VLOG(2) << "Computation: " << computation->name(); if (algorithm) { return algorithm(computation, points_to_analysis, size_function, memory_by_computation); @@ -404,17 +404,17 @@ StatusOr ScheduleComputationHelper( } // namespace StatusOr DFSMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& memory_by_computation) { // These variables are a hack to prevent overflows. int64 cumulative_total_size = 0; - int64 total_hlos = computation.parent()->instruction_count(); + int64 total_hlos = computation->parent()->instruction_count(); absl::flat_hash_map extra_users; absl::flat_hash_map total_sizes; - for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) { + for (const HloInstruction* hlo : computation->MakeInstructionPostOrder()) { if (ListScheduler::IgnoreInstruction(*hlo)) { extra_users[hlo] = 0; total_sizes[hlo] = 0; @@ -448,8 +448,8 @@ StatusOr DFSMemoryScheduler( total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size); extra_users[hlo] = std::min(extra_users[hlo], total_hlos); } - CHECK_EQ(extra_users.size(), computation.instruction_count()); - CHECK_EQ(total_sizes.size(), computation.instruction_count()); + CHECK_EQ(extra_users.size(), computation->instruction_count()); + CHECK_EQ(total_sizes.size(), computation->instruction_count()); // Construct a total order based on DFS post-order, visiting operands in // decreasing cumulative extra user order, and next by cumulative size, with a @@ -459,7 +459,7 @@ StatusOr DFSMemoryScheduler( sequence.push_back(hlo); return Status::OK(); }); - TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder( + TF_RETURN_IF_ERROR(computation->AcceptWithOperandOrder( &visitor, [&extra_users, &total_sizes](const HloInstruction* a, const HloInstruction* b) { if (extra_users[a] != extra_users[b]) { @@ -470,12 +470,12 @@ StatusOr DFSMemoryScheduler( } return a->name() < b->name(); })); - CHECK_EQ(sequence.size(), computation.instruction_count()); + CHECK_EQ(sequence.size(), computation->instruction_count()); return sequence; } // namespace xla StatusOr ListMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -485,16 +485,16 @@ StatusOr ListMemoryScheduler( } StatusOr PostOrderMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& memory_by_computation) { - return HloInstructionSequence(computation.MakeInstructionPostOrder()); + return HloInstructionSequence(computation->MakeInstructionPostOrder()); } StatusOr DefaultMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -513,7 +513,7 @@ StatusOr DefaultMemoryScheduler( memory_by_computation)); TF_ASSIGN_OR_RETURN(const int64 list_memory, HeapSimulator::MinimumMemoryForComputation( - computation, list_sequence, points_to_analysis, + *computation, list_sequence, points_to_analysis, size_function, &memory_by_computation)); VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory); @@ -522,7 +522,7 @@ StatusOr DefaultMemoryScheduler( size_function, memory_by_computation)); TF_ASSIGN_OR_RETURN(const int64 dfs_memory, HeapSimulator::MinimumMemoryForComputation( - computation, dfs_sequence, points_to_analysis, + *computation, dfs_sequence, points_to_analysis, size_function, &memory_by_computation)); VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory); @@ -532,7 +532,7 @@ StatusOr DefaultMemoryScheduler( memory_by_computation)); TF_ASSIGN_OR_RETURN(const int64 post_order_memory, HeapSimulator::MinimumMemoryForComputation( - computation, post_order_sequence, points_to_analysis, + *computation, post_order_sequence, points_to_analysis, size_function, &memory_by_computation)); VLOG(2) << "Min-memory post order sequence: " << HumanReadableNumBytes(post_order_memory); @@ -555,17 +555,17 @@ StatusOr DefaultMemoryScheduler( } StatusOr ScheduleModule( - const HloModule& module, const LogicalBuffer::SizeFunction& size_function, + HloModule* module, const LogicalBuffer::SizeFunction& size_function, const MemorySchedulerAlgorithm& algorithm) { - HloSchedule schedule(&module); + HloSchedule schedule(module); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, - TuplePointsToAnalysis::Run(&module)); + TuplePointsToAnalysis::Run(module)); absl::flat_hash_map memory_by_computation; - for (const auto* computation : module.MakeComputationPostOrder()) { + for (auto* computation : module->MakeComputationPostOrder()) { if (!computation->IsFusionComputation()) { TF_ASSIGN_OR_RETURN(HloInstructionSequence computation_sequence, ScheduleComputationHelper( - *computation, *points_to_analysis, size_function, + computation, *points_to_analysis, size_function, algorithm, memory_by_computation)); memory_by_computation[computation] = HeapSimulator::MinimumMemoryForComputation( @@ -583,11 +583,11 @@ StatusOr ScheduleModule( } StatusOr ScheduleComputation( - const HloComputation& computation, + HloComputation* computation, const LogicalBuffer::SizeFunction& size_function) { - CHECK(!computation.IsFusionComputation()); + CHECK(!computation->IsFusionComputation()); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, - TuplePointsToAnalysis::Run(computation.parent())); + TuplePointsToAnalysis::Run(computation->parent())); absl::flat_hash_map empty_map; return ScheduleComputationHelper(computation, *points_to_analysis, size_function, nullptr, empty_map); @@ -600,7 +600,7 @@ HloMemoryScheduler::HloMemoryScheduler( StatusOr HloMemoryScheduler::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(HloSchedule schedule, - ScheduleModule(*module, size_function_, algorithm_)); + ScheduleModule(module, size_function_, algorithm_)); TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule))); return true; } diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h index cca5dc493989811a0bb9790c3237e5468a3f2d67..7227bfb27c74758d2b79e404afc9eb97a1ca894d 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h @@ -36,14 +36,14 @@ namespace xla { // that describes buffer aliasing, together with a target-specific size function // that maps a tensor's logical size to its padded size. typedef std::function( - const HloComputation&, const TuplePointsToAnalysis&, + HloComputation*, const TuplePointsToAnalysis&, const LogicalBuffer::SizeFunction&, const absl::flat_hash_map&)> MemorySchedulerAlgorithm; // List scheduler StatusOr ListMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -51,7 +51,7 @@ StatusOr ListMemoryScheduler( // DFS-order scheduler StatusOr DFSMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -59,7 +59,7 @@ StatusOr DFSMemoryScheduler( // Naive Post Order scheduler StatusOr PostOrderMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -69,7 +69,7 @@ StatusOr PostOrderMemoryScheduler( // and the DFS scheduler, and chooses whichever returns a lower min-memory, // not accounting for fragmentation. StatusOr DefaultMemoryScheduler( - const HloComputation& computation, + HloComputation* computation, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_function, const absl::flat_hash_map& @@ -79,13 +79,13 @@ StatusOr DefaultMemoryScheduler( // the computation. size_function is the function returning the number of bytes // required for a LogicalBuffer. StatusOr ScheduleModule( - const HloModule& module, const LogicalBuffer::SizeFunction& size_function, + HloModule* module, const LogicalBuffer::SizeFunction& size_function, const MemorySchedulerAlgorithm& algorithm = {}); // Computes the schedule for a single computation. // Currently only used by the GPU backend. StatusOr ScheduleComputation( - const HloComputation& computation, + HloComputation* computation, const LogicalBuffer::SizeFunction& size_function); // A pass which schedules the HLO instructions in a module. The HloModule's diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc index 3d8482065a4a243dfe3efb3ebb173cf214a49f0c..bc0d7e2bc00eab014f2660c95a51b966642eaee9 100644 --- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc +++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc @@ -78,7 +78,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) { TF_ASSERT_OK(module->schedule().Verify()); // Verify that all instructions are in the sequence. - const std::vector& sequence = + const std::vector& sequence = module->schedule().sequence(module->entry_computation()).instructions(); EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size()); @@ -124,9 +124,9 @@ ENTRY root { }; TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, size_fn, ListMemoryScheduler)); + ScheduleModule(module.get(), size_fn, ListMemoryScheduler)); // Verify that all instructions are in the sequence. - const std::vector& sequence = + const std::vector& sequence = schedule.sequence(module->entry_computation()).instructions(); EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size()); @@ -175,12 +175,13 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule, - ScheduleModule(*module, - [](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf( - buffer.shape(), TUPLE_SIZE); - }, - ListMemoryScheduler)); + ScheduleModule( + module.get(), + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), + TUPLE_SIZE); + }, + ListMemoryScheduler)); // Verify that all instructions are in the sequence. EXPECT_EQ(module->entry_computation()->instruction_count(), @@ -225,12 +226,12 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { {tuple, mul, add}, HloInstruction::FusionKind::kLoop); TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule, - ScheduleModule(*module, - [](const BufferValue& buffer) { - return ShapeUtil::ByteSizeOf( - buffer.shape(), 2); - }, - ListMemoryScheduler)); + ScheduleModule( + module.get(), + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), 2); + }, + ListMemoryScheduler)); // Verify that all instructions are in the sequence. EXPECT_EQ(module->entry_computation()->instruction_count(), @@ -284,7 +285,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) { }; TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, size_fn, ListMemoryScheduler)); + ScheduleModule(module.get(), size_fn, ListMemoryScheduler)); // Verify that all instructions are in the sequence. auto entry_computation = module->entry_computation(); EXPECT_EQ(module->entry_computation()->instruction_count(), diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 14bf17f4be16f8cf820753bc9f0473029834f1f8..a01853fe1f272a28a7002274cf5774e034c3337d 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -240,8 +240,10 @@ HloModuleProto HloModule::ToProto() const { *proto.mutable_schedule() = schedule().ToProto().ValueOrDie(); } *proto.mutable_host_program_shape() = - entry_computation_layout().ComputeProgramShape(); + entry_computation_layout().ComputeProgramShape().ToProto(); *proto.mutable_input_output_alias() = input_output_alias_config().ToProto(); + *proto.mutable_dynamic_parameter_binding() = + dynamic_parameter_binding().ToProto(); return proto; } @@ -325,6 +327,10 @@ StatusOr> HloModule::CreateFromProto( // Because we didn't uniquify the names or the ids, double-check that the // instruction and computation names and ids are unique from the proto. + TF_ASSIGN_OR_RETURN(module->dynamic_parameter_binding_, + DynamicParameterBinding::CreateFromProto( + proto.dynamic_parameter_binding())); + absl::flat_hash_set computation_names; absl::flat_hash_set instruction_names; absl::flat_hash_set computation_ids; @@ -365,7 +371,7 @@ StatusOr HloModule::CreateModuleConfigFromProto( << "No program shape found in the proto"; const auto& program_shape = module.host_program_shape(); - HloModuleConfig module_config(program_shape); + HloModuleConfig module_config(ProgramShape{program_shape}); module_config.set_debug_options(debug_options); // The module config is constructed with default layouts regardless of what is diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index 8a1f999e3ab076b87a651a915f4de93320e7067f..66622a1d260c28078d69b01b858fd292b697805b 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/iterator_util.h" +#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_clone_context.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -103,11 +104,7 @@ class HloModule { HloCloneContext* context = nullptr); // Return a pointer to the entry computation of the module. - const HloComputation* entry_computation() const { - CHECK_NE(nullptr, entry_computation_); - return entry_computation_; - } - HloComputation* entry_computation() { + HloComputation* entry_computation() const { CHECK_NE(nullptr, entry_computation_); return entry_computation_; } @@ -232,6 +229,16 @@ class HloModule { return input_output_alias_config_; } + // DynamicParameterBinding holds the list of bindings that indicates which + // parameter dimensions are dynamic and which parameters represent their + // runtime value. + DynamicParameterBinding& dynamic_parameter_binding() { + return dynamic_parameter_binding_; + } + const DynamicParameterBinding& dynamic_parameter_binding() const { + return dynamic_parameter_binding_; + } + // Returns an id that is unique to this module across all modules created over // the lifetime of this process. int unique_id() const { return unique_id_; } @@ -285,6 +292,9 @@ class HloModule { // alias_config indicates the alias information of input/output buffers that // are expected from the module. HloInputOutputAliasConfig input_output_alias_config_; + + // Bindings for dynamic parameter mapping. + DynamicParameterBinding dynamic_parameter_binding_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 70c7d70b41c5c7bc94d1fac83c0fcf71f155b5f0..127cfd165a5d8229cac3035f56a66f1bcfa734f3 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -47,6 +47,8 @@ namespace xla { #define HLO_OPCODE_LIST(V) \ V(kAbs, "abs") \ V(kAdd, "add") \ + V(kAddDependency, "add-dependency") \ + V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ V(kAllToAll, "all-to-all") \ V(kAtan2, "atan2") \ V(kBatchNormGrad, "batch-norm-grad") \ @@ -84,7 +86,6 @@ namespace xla { V(kGather, "gather") \ V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \ V(kGetDimensionSize, "get-dimension-size") \ - V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ V(kGetTupleElement, "get-tuple-element") \ V(kGt, "greater-than", kHloOpcodeIsComparison) \ V(kImag, "imag") \ diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index f5f99bece18cc637365118ddcd1273da05f4e1b6..ca6a154809be46d6a0305c29e2b89219de408019 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -356,8 +356,7 @@ void SequentialHloOrdering::Initialize() { // Create a map from instruction to its order position. TF_DCHECK_OK(schedule_.Verify()); for (const auto& computation_sequence : schedule_.sequences()) { - const std::vector& order = - computation_sequence.second.instructions(); + const auto& order = computation_sequence.second.instructions(); for (int i = 0; i < order.size(); ++i) { InsertOrDie(&order_position_, order[i], i); } diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index 4390145c6bd7484987b2851ef92336defffb388b..9b5bb5d0bd6af104ef62eaa5d3e53cedbe0213d3 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -47,11 +47,11 @@ const double kF16max = 65504; // Creates and returns a schedule created using the order of the instructions in // the HloComputation::instructions() vectors in the module. -HloSchedule ScheduleFromInstructionOrder(const HloModule* module) { +HloSchedule ScheduleFromInstructionOrder(HloModule* module) { HloSchedule schedule(module); - for (const HloComputation* computation : module->computations()) { + for (HloComputation* computation : module->computations()) { if (!computation->IsFusionComputation()) { - for (const HloInstruction* instruction : computation->instructions()) { + for (HloInstruction* instruction : computation->instructions()) { schedule.GetOrCreateSequence(computation).push_back(instruction); } } @@ -850,6 +850,15 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder, } break; } + case HloOpcode::kAddDependency: { + if (!ParseOperands(&operands, /*expected_size=*/2) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction( + HloInstruction::CreateAddDependency(operands[0], operands[1])); + break; + } case HloOpcode::kSort: { optional> dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 88682e55fb37e6cacbeaf5826286cc9f70e57e3b..f13f7504ee1fcc9edc94a40c6d48786b196a7959 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1241,7 +1241,38 @@ ENTRY Sort { } )" + }, +// AfterAll with multiple operands +{ +"AfterAllWithMultipleOperands", +R"(HloModule AfterAllWithMultipleOperands + +ENTRY AfterAllWithMultipleOperands { + p0 = f32[] parameter(0) + token0 = token[] after-all() + token1 = token[] after-all() + ROOT after-all = token[] after-all(p0, token0, token1) } + +)" +}, +// AddDependency +// A dependency chain is created from 'neg' to 'exp' using tokens. +{ +"AddDependency", +R"(HloModule AddDependency + +ENTRY AddDependency { + p = f32[] parameter(0) + neg = f32[] negate(p) + token = token[] after-all(neg) + p_after_token = f32[] add-dependency(p, token) + exp = f32[] exponential(p_after_token) + ROOT sum = f32[] add(neg, exp) +} + +)" +}, }); // clang-format on } diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 49e46ecd00ee4370f3e93746348373b79febed3d..48add75523f02005c70bc6baf69a6b7d5aa4f7ef 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -130,10 +130,10 @@ using ItemList = absl::InlinedVector; // before arbitrary elements. class InstructionList { public: - explicit InstructionList(const std::vector& order) { + explicit InstructionList(const HloInstructionSequence& order) { int64 position = 0; Item* last = nullptr; - for (const HloInstruction* inst : order) { + for (HloInstruction* inst : order.instructions()) { // Add a new item to the linked list. Item* item = new Item; item->next = nullptr; @@ -151,7 +151,7 @@ class InstructionList { // to be monotonically increasing through the list, and so is still useful // for quickly(-ish) determining the order of arbitrary instructions in // the list. - item->instruction = const_cast(inst); + item->instruction = inst; item->position = position; position++; @@ -927,7 +927,7 @@ Item* PickRematerializationCandidate( StatusOr HloRematerialization::ComputePeakMemory( const HloComputation* computation, - const std::vector& order) const { + const HloInstructionSequence& order) const { InstructionList instruction_list(order); MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_, instruction_list); @@ -971,8 +971,7 @@ StatusOr HloRematerialization::RematerializeComputation( << HumanReadableNumBytes(computation_peak_memory_.at(computation)); CHECK(!ContainsKey(rematerialized_computations_, computation)); - InstructionList instruction_list( - schedule->sequence(computation).instructions()); + InstructionList instruction_list(schedule->sequence(computation)); MemoryUsageTracker memory_tracker(computation, size_function_, *points_to_analysis_, instruction_list); bool changed = false; @@ -1184,7 +1183,7 @@ StatusOr HloRematerialization::RematerializeComputation( sequence.clear(); for (auto* item = instruction_list.first(); item != nullptr; item = instruction_list.next(item)) { - const HloInstruction* instruction = item->instruction; + HloInstruction* instruction = item->instruction; sequence.push_back(instruction); } rematerialized_computations_.insert(computation); @@ -1235,10 +1234,8 @@ StatusOr HloRematerialization::Run(HloModule* module) { if (node.context() == CallContext::kSequential) { TF_ASSIGN_OR_RETURN( computation_peak_memory_[node.computation()], - ComputePeakMemory(node.computation(), - module->schedule() - .sequence(node.computation()) - .instructions())); + ComputePeakMemory(node.computation(), module->schedule().sequence( + node.computation()))); } return Status::OK(); }, diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 70d83c04f07ca7fd0139f586869e8fe688f958f4..a07d348041b72bba45c6fd1f726f2a0065d01e53 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -87,9 +87,8 @@ class HloRematerialization : public HloModulePass { // peak memory is the maximum total size of all live HLO instruction values at // any program point. 'order' is the order in which the HLO instructions will // be emitted which is used to determine lifespans of HLO values. - StatusOr ComputePeakMemory( - const HloComputation* computation, - const std::vector& order) const; + StatusOr ComputePeakMemory(const HloComputation* computation, + const HloInstructionSequence& order) const; // Returns the peak memory usage of the called computations for the given // instruction. Zero is returned if the instruction calls no computations. diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc index a5780b7551a43f2b64f2ac61ef1bf6ce9e07eb16..8f6eb974c5179b420c8f961393ca923e0a3b3530 100644 --- a/tensorflow/compiler/xla/service/hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/hlo_schedule.cc @@ -46,8 +46,8 @@ namespace xla { << "No computation exists in HLO module with id " << computation_id; const HloComputation* computation = comp_it->second; - absl::flat_hash_map id_to_instruction; - for (const HloInstruction* instruction : computation->instructions()) { + absl::flat_hash_map id_to_instruction; + for (HloInstruction* instruction : computation->instructions()) { id_to_instruction[instruction->unique_id()] = instruction; } @@ -81,9 +81,8 @@ StatusOr HloSchedule::ToProto() const { return std::move(proto); } -void HloSchedule::set_sequence( - const HloComputation* computation, - absl::Span sequence) { +void HloSchedule::set_sequence(const HloComputation* computation, + absl::Span sequence) { set_sequence(computation, HloInstructionSequence(sequence)); } @@ -114,8 +113,8 @@ Status HloSchedule::UpdateComputationSchedule( const HloComputation* computation) { // Map from unique ID to HloInstruction pointer for instructions in the // computation. - absl::flat_hash_map id_to_instruction; - for (const HloInstruction* instruction : computation->instructions()) { + absl::flat_hash_map id_to_instruction; + for (HloInstruction* instruction : computation->instructions()) { InsertOrDie(&id_to_instruction, instruction->unique_id(), instruction); } @@ -128,7 +127,7 @@ Status HloSchedule::UpdateComputationSchedule( // Map from HloInstruction X to newly added instructions (instruction is in // computation, but not in schedule) which use X. If an instruction is not in // the map, then it has no users which are newly added instructions. - absl::flat_hash_map> + absl::flat_hash_map> new_instruction_uses; // For each newly added instruction, this is the count of the instruction's @@ -138,9 +137,9 @@ Status HloSchedule::UpdateComputationSchedule( // Create a worklist of newly added instructions which are ready to be added // to the schedule. Initialize worklist with those that have zero operands. - std::queue worklist; + std::queue worklist; - for (const HloInstruction* instruction : computation->instructions()) { + for (HloInstruction* instruction : computation->instructions()) { if (ids_in_schedule.count(instruction->unique_id()) == 0) { // This is a newly added instruction which is not in the schedule. if (instruction->operands().empty()) { @@ -161,17 +160,17 @@ Status HloSchedule::UpdateComputationSchedule( // Lambda which schedules all instructions on the worklist. auto schedule_worklist = [&]() { while (!worklist.empty()) { - const HloInstruction* instruction = worklist.front(); + HloInstruction* instruction = worklist.front(); worklist.pop(); new_sequence.push_back(instruction); - std::vector* new_users = + std::vector* new_users = tensorflow::gtl::FindOrNull(new_instruction_uses, instruction); if (new_users != nullptr) { // This just-scheduled instruction has users which are newly added to // the module. Update the number of unscheduled operands and push the // newly added instruction to the worklist if it is ready to // schedule. - for (const HloInstruction* new_user : *new_users) { + for (HloInstruction* new_user : *new_users) { unscheduled_operand_count.at(new_user)--; CHECK_GE(unscheduled_operand_count.at(new_user), 0); if (unscheduled_operand_count.at(new_user) == 0) { diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h index 0a714101ee587aa847fa674bbde5586287c51f33..486ddbf499de80c634bc497158cd79ca066cc866 100644 --- a/tensorflow/compiler/xla/service/hlo_schedule.h +++ b/tensorflow/compiler/xla/service/hlo_schedule.h @@ -35,14 +35,14 @@ class HloInstructionSequence { public: HloInstructionSequence() = default; explicit HloInstructionSequence( - absl::Span instructions) { - for (const HloInstruction* instruction : instructions) { + absl::Span instructions) { + for (HloInstruction* instruction : instructions) { push_back(instruction); } } // Adds the instruction to the end of the sequence. - void push_back(const HloInstruction* instruction) { + void push_back(HloInstruction* instruction) { instruction_sequence_.push_back(instruction); id_sequence_.push_back(instruction->unique_id()); } @@ -56,7 +56,7 @@ class HloInstructionSequence { int64 size() const { return instruction_sequence_.size(); } // Returns the sequence of HLO instructions. - const std::vector& instructions() const { + const std::vector& instructions() const { return instruction_sequence_; } @@ -65,7 +65,7 @@ class HloInstructionSequence { private: // The sequence as HloInstructions. - std::vector instruction_sequence_; + std::vector instruction_sequence_; // The sequence of HLO instructions, represented by their unique IDs. The // sequence is stored as both HloInstructions and unique IDs because the @@ -98,7 +98,7 @@ class HloSchedule { // Sets the sequence for the given computation to the given sequence. void set_sequence(const HloComputation* computation, - absl::Span sequence); + absl::Span sequence); void set_sequence(const HloComputation* computation, HloInstructionSequence sequence); diff --git a/tensorflow/compiler/xla/service/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/hlo_schedule_test.cc index 1424569ac1f62e4b965876141f1eb40be4f15bea..0e56e6f760e35ddcb45c6f58771d78405a09acfe 100644 --- a/tensorflow/compiler/xla/service/hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/hlo_schedule_test.cc @@ -56,10 +56,10 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); - const std::vector& entry_schedule = + const auto& entry_schedule = schedule.sequence(module->entry_computation()).instructions(); EXPECT_EQ(entry_schedule.size(), 6); @@ -90,7 +90,7 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); @@ -139,7 +139,7 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); @@ -183,7 +183,7 @@ ENTRY main { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); @@ -244,7 +244,7 @@ ENTRY %WhileLoop () -> s32[] { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/sizeof(void*)); })); @@ -313,7 +313,7 @@ ENTRY %WhileLoop () -> s32[] { ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN( HloSchedule schedule, - ScheduleModule(*module, [](const BufferValue& buffer) { + ScheduleModule(module.get(), [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/sizeof(void*)); })); diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index b6670d409b92e8be42f5cdb40fba8d662ae83958..1f01b0bb365450a933da9cc443db5223c06903f0 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -166,9 +166,6 @@ class HloValue : public BufferValue { // Whether this value is live out of the HLO module. bool live_out_of_module_ = false; - - // Whether this value is live out of its computation. - bool live_out_of_computation_ = false; }; std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index c5688a445acdcbcd06d1a43a117d1bde8f95f07d..017549ce1b95cc4abc67ec1d15dadbc1398c977a 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -753,7 +753,13 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) { for (const HloInstruction* operand : token->operands()) { operand_shapes.push_back(&operand->shape()); } - return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes)); + return CheckShape(token, ShapeUtil::MakeTokenShape()); +} + +Status ShapeVerifier::HandleAddDependency(HloInstruction* add_dependency) { + TF_RETURN_IF_ERROR(CheckOperandCount(add_dependency, 2)); + TF_RETURN_IF_ERROR(CheckIsTokenOperand(add_dependency, 1)); + return CheckShape(add_dependency, add_dependency->operand(0)->shape()); } Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) { @@ -1426,6 +1432,8 @@ StatusOr HloVerifier::Run(HloModule* module) { return target_metadata_->ShapeSize(shape); })); + TF_RETURN_IF_ERROR(module->dynamic_parameter_binding().Verify(*module)); + return false; } diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 9fbfd6a21c1f1148801000169046fbcbb37934fe..e4d0c3d6957885f1d719fedb5a900de601e397f8 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -95,6 +95,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleScatter(HloInstruction* scatter) override; Status HandleAfterAll(HloInstruction* token) override; Status HandleGetDimensionSize(HloInstruction* get_size) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; Status FinishVisit(HloInstruction*) override { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 7f2d7e7cffc6debaaf9b64fffc5a8a7037ecdaa3..2297edcbe1d167f0752423f76b795b3592e85c47 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -103,7 +103,6 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kShiftRightLogical: case HloOpcode::kSlice: case HloOpcode::kSubtract: - case HloOpcode::kAfterAll: case HloOpcode::kTranspose: case HloOpcode::kTuple: case HloOpcode::kTupleSelect: @@ -116,7 +115,10 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kSin: return ShapeUtil::ElementIsComplex(instruction.shape()); - // Expensive instructions. + // Expensive instructions or unusual instructions for which fusion is + // nonsensical. + case HloOpcode::kAddDependency: + case HloOpcode::kAfterAll: case HloOpcode::kAtan2: case HloOpcode::kBatchNormGrad: case HloOpcode::kBatchNormInference: diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index a06d6113e84630df14ff68280c248cccb9afaf06..7635fbfed6f6a51fc9d203251d9bebf43cc63fd9 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -37,7 +37,7 @@ namespace xla { namespace interpreter { InterpreterExecutable::InterpreterExecutable( - std::unique_ptr hlo_module, + std::unique_ptr hlo_module, std::unique_ptr evaluator) : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, /*hlo_profile_index_map=*/nullptr), diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index 3b1ebce0c75457d65e6834c809fe488a9c4a159a..bda13d376360306c81230e41b01cefc6caff230d 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -42,7 +42,7 @@ namespace interpreter { // buffer allocation. Refer to interpreter/README.md for more. class InterpreterExecutable : public Executable { public: - InterpreterExecutable(std::unique_ptr hlo_module, + InterpreterExecutable(std::unique_ptr hlo_module, std::unique_ptr evaluator); ~InterpreterExecutable() override; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index a90411922205c0006159ff99f35a70138b1bee4f..eddef850cf5250b85b564c1e6c92d1cc8ecd1a43 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -2000,6 +2000,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( switch (instruction->opcode()) { case HloOpcode::kAbs: case HloOpcode::kAdd: + case HloOpcode::kAddDependency: case HloOpcode::kAnd: case HloOpcode::kAtan2: case HloOpcode::kBitcastConvert: diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 2400b7bb7c409a4dcb33e6e8f4b409738510f3d6..61d8a0a4e6aa39e2e921acae1c65df1b3c329e46 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -328,11 +328,10 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) { // %tuple.1 = Tuple(%copy) layout=({0,1}) // %tuple.2 = Tuple(%tuple.0, %tuple.1) layout=(({1,0}), ({0,1})) // - EXPECT_TRUE( - AlgebraicSimplifier(/*is_layout_sensitive=*/true, - [](const Shape&, const Shape&) { return false; }) - .Run(m.get()) - .ValueOrDie()); + AlgebraicSimplifierOptions options( + [](const Shape&, const Shape&) { return false; }); + options.set_is_layout_sensitive(true); + EXPECT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie()); HloInstruction* root = m->entry_computation()->root_instruction(); // Verify layout of the root and the root's operands. EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape())); diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h index c44ffb9e07be20b2ebaa697c6d8bcda1269c2c68..06002d57b0d7daa07f903feebe67a60a083c0e7c 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h @@ -90,6 +90,10 @@ class KernelMappingScheme { enum { DimZ = 0, DimY, DimX, DimTot }; public: + // dims_in_elems: the normalized tensor dimensions. + // req_block_sizes: the requested block size in number of tiles for each + // dimension. The actual block size is set to min(req_block_size, + // dims_in_number_of_blocks). explicit KernelMappingScheme(absl::Span dims_in_elems, int64 tile_size_y, int64 tile_size_x, absl::Span req_block_sizes, diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc index fd16af67fe99b4f440ad962b4b648a3b22c41dc6..e22c2173c271fc9571be1ddb0759d2b31562dc98 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc @@ -47,7 +47,8 @@ namespace { // Adds the inner comparison loop body where we compare elements. void EmitCompareLoopBody( int64 iteration_bound, PrimitiveType key_type, int64 num_values, - llvm::Value* element_pair_index, int64 xor_mask, llvm::Type* index_type, + int64 iota_values_parameter_index, llvm::Value* element_pair_index, + int64 xor_mask, llvm::Type* index_type, std::function read_element, std::function write_element, @@ -139,34 +140,42 @@ void EmitCompareLoopBody( is_signed_comparison = false; } // If key2 < key1 - ksl.IfReturnVoid( - "is_smaller_than", + auto is_smaller_than = b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT : llvm::ICmpInst::ICMP_ULT, - compare_key2, compare_key1), - [&]() { - // Swap key1 with key2. - write_element(0, current_keys_index, key2); - write_element(0, compare_keys_index, key1); - for (int64 i = 1; i <= num_values; ++i) { - // Also swap the values. - auto value1 = read_element(i, current_keys_index); - auto value2 = read_element(i, compare_keys_index); - write_element(i, current_keys_index, value2); - write_element(i, compare_keys_index, value1); - } - }); + compare_key2, compare_key1); + if (iota_values_parameter_index >= 0) { + auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2); + auto key_index1 = + read_element(iota_values_parameter_index, current_keys_index); + auto key_index2 = + read_element(iota_values_parameter_index, compare_keys_index); + auto index_is_smaller_than = + b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1); + is_smaller_than = b->CreateOr( + is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than)); + } + ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() { + // Swap key1 with key2. + write_element(0, current_keys_index, key2); + write_element(0, compare_keys_index, key1); + for (int64 i = 1; i <= num_values; ++i) { + // Also swap the values. + auto value1 = read_element(i, current_keys_index); + auto value2 = read_element(i, compare_keys_index); + write_element(i, current_keys_index, value2); + write_element(i, compare_keys_index, value1); + } + }); }); } -void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, - int64 dimension_to_sort, - int64 dimension_to_sort_bound, - PrimitiveType keys_type, - absl::Span xor_masks, - const std::vector& params, - const std::vector& param_shmem_buffers, - int64 tile_size, llvm::IRBuilder<>* b) { +void EmitTiledCompareLoop( + const IrArray::Index& tiled_keys_index, int64 dimension_to_sort, + int64 dimension_to_sort_bound, PrimitiveType keys_type, + absl::Span xor_masks, const std::vector& params, + const std::vector& param_shmem_buffers, + int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) { KernelSupportLibrary ksl(b); llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b); @@ -253,20 +262,22 @@ void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, RoundDownToNearest(dimension_to_sort_bound, tile_size))), [&]() { EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type, - params.size() - 1, element_pair_index, xor_mask, + params.size() - 1, iota_values_parameter_index, + element_pair_index, xor_mask, tiled_keys_index.GetType(), read_element, write_element, b); }, [&]() { - EmitCompareLoopBody( - tile_size, keys_type, params.size() - 1, element_pair_index, - xor_mask, tiled_keys_index.GetType(), read_element, - write_element, b, /*needs_bounds_checks=*/false); + EmitCompareLoopBody(tile_size, keys_type, params.size() - 1, + iota_values_parameter_index, element_pair_index, + xor_mask, tiled_keys_index.GetType(), + read_element, write_element, b, + /*needs_bounds_checks=*/false); }); } else { EmitCompareLoopBody(tile_size, keys_type, params.size() - 1, - element_pair_index, xor_mask, - tiled_keys_index.GetType(), read_element, + iota_values_parameter_index, element_pair_index, + xor_mask, tiled_keys_index.GetType(), read_element, write_element, b, /*needs_bounds_checks=*/false); } // Wait until all comparisons have happened. @@ -296,6 +307,7 @@ void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index, Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, + int64 iota_values_parameter_index, absl::string_view name, absl::Span xor_masks, llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions, @@ -367,8 +379,8 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, if (xor_masks.size() > 1) { EmitTiledCompareLoop(keys_index, dimension_to_sort, dimension_to_sort_bound, keys_shape.element_type(), - xor_masks, params, param_shmem_buffers, tile_size, - b); + xor_masks, params, param_shmem_buffers, + iota_values_parameter_index, tile_size, b); } else { auto read_element = [&](int64 operand, llvm::Value* index) { keys_index[dimension_to_sort] = index; @@ -380,9 +392,10 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, params[operand].EmitWriteArrayElement(keys_index, value, b); }; EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(), - values_arrays.size(), tiles_index[rank - 1], - xor_masks[0], tiles_index.GetType(), read_element, - write_element, b); + values_arrays.size(), iota_values_parameter_index, + tiles_index[rank - 1], xor_masks[0], + tiles_index.GetType(), read_element, write_element, + b); } return Status::OK(); }; diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h index 556a217322d373ffd5e816dcf35888b546806633..685f9383acba416f51681270e4037d56abb4b6ea 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h +++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h @@ -31,9 +31,12 @@ namespace llvm_ir { // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort' // dimension of 'keys_array'. All other dimensions are kept as-is. This // implements the inner loop of BitonicSort. It is assumed that 'xor_masks' -// contains only powers of 2, or values 2^k - 1 (k > 0). +// contains only powers of 2, or values 2^k - 1 (k > 0). If +// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand +// that is a iota and can be used to make the sorting stable. Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array, const std::vector& values_arrays, + int64 iota_values_parameter_index, absl::string_view name, absl::Span xor_masks, llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions, diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 2180ac845dd71da3a67b0a818866540764ce0848..5c105908f3653538a31f309096e697cf52a075bf 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -145,7 +145,7 @@ StatusOr> LocalService::CompileExecutable( const ExecutableBuildOptions& build_options) { const HloModuleProto& proto = computation.proto(); TF_RET_CHECK(proto.has_host_program_shape()); - const ProgramShape& program_shape = proto.host_program_shape(); + ProgramShape program_shape(proto.host_program_shape()); // Validate incoming layouts. if (argument_layouts.size() != program_shape.parameters_size()) { diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc index ec52a24d782a44fda961feab3230886072e755c7..972a5b9ced0d84387ef8308efe2a7aff7317d047 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc @@ -113,6 +113,13 @@ Status LogicalBufferAnalysis::HandleGetTupleElement(HloInstruction*) { return Status::OK(); } +Status LogicalBufferAnalysis::HandleAddDependency( + HloInstruction* add_dependency) { + // AddDependency just forwards the value of its zero-th operand and does not + // create buffers. + return Status::OK(); +} + Status LogicalBufferAnalysis::HandleCopy(HloInstruction* copy) { // The top-level buffer (index={}) for kCopy is newly created, but all other // buffers (in the case of a tuple shape) come from the operand diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h index 81f524d84a8091e1fff13dc7c55b401143a02753..7ffca943d0f7805ad4420343fcdbf860415c4c40 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h @@ -64,6 +64,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault { Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; // A map from the buffer ID to the logical buffer std::vector> logical_buffers_; diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 75f7413b3c303da620c2815c83e03324148c0961..c4b0a5c0805300ebcf46bd80a6c535f7d89d3e3f 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -658,9 +658,9 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, // replica 0. TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(request.computation().host_program_shape(), - replicated_arguments.front(), - request.execution_options())); + CreateModuleConfig( + ProgramShape{request.computation().host_program_shape()}, + replicated_arguments.front(), request.execution_options())); VLOG(3) << "ExecuteGraphParallel created HloModuleConfig computation layout: " << module_config->entry_computation_layout().ToString(); @@ -824,7 +824,7 @@ Status Service::Compile(const CompileRequest* arg, CompileResponse* result) { [](const Shape& shape) { return &shape; }); TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(arg->computation().host_program_shape(), + CreateModuleConfig(ProgramShape{arg->computation().host_program_shape()}, argument_shapes, &arg->execution_options())); VLOG(3) << "Compile created HloModuleConfig computation layout: " << module_config->entry_computation_layout().ToString(); @@ -957,21 +957,6 @@ Status Service::TransferToClient(const TransferToClientRequest* arg, return Status::OK(); } -namespace { - -// Creates a clone of the given shaped buffer with the given device ordinal. The -// shape and DeviceMemoryBase values of the clone are identical to the original. -std::unique_ptr CloneShapedBufferOnDevice( - const ShapedBuffer& shaped_buffer, int device_ordinal) { - auto clone = absl::make_unique( - shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(), - shaped_buffer.platform(), device_ordinal); - clone->buffers() = shaped_buffer.buffers(); - return clone; -} - -} // namespace - Status Service::TransferToServer(const TransferToServerRequest* arg, TransferToServerResponse* result) { TF_ASSIGN_OR_RETURN(Literal literal, @@ -1087,7 +1072,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg, "constant computation may not depend on any parameters."); } - ProgramShape program_shape = arg->computation().host_program_shape(); + ProgramShape program_shape(arg->computation().host_program_shape()); TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result())); if (arg->has_output_layout()) { TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape( @@ -1131,7 +1116,7 @@ Status Service::GetComputationGraphStats( return InvalidArgument("Program shape may not be empty."); } - HloModuleConfig config(arg->computation().host_program_shape()); + HloModuleConfig config(ProgramShape{arg->computation().host_program_shape()}); config.set_debug_options(arg->debug_options()); TF_ASSIGN_OR_RETURN(std::unique_ptr module, CreateModuleFromProto(arg->computation(), config)); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 2bfc1676bddc66bdc90052589ed3024510c24d8f..528d5c0ecc76054bb7ff9717b84c42a50415246d 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -391,17 +391,6 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, return ShapeUtil::MakeShape(element_type, new_dimensions); } -/* static */ StatusOr ShapeInference::InferAfterAllShape( - absl::Span arg_shapes) { - for (const Shape* arg_shape : arg_shapes) { - if (arg_shape->element_type() != TOKEN) { - return InvalidArgument( - "Operands of token instructions must be TOKEN types."); - } - } - return ShapeUtil::MakeTokenShape(); -} - /* static */ StatusOr ShapeInference::InferConvertShape( const Shape& operand_shape, PrimitiveType new_element_type) { auto old_element_type = operand_shape.element_type(); diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 31ef4b2e41078f87731a1eff58e37409a6004ba4..d94385a04d50baff8156570a09620fd458547936 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -232,13 +232,6 @@ class ShapeInference { static StatusOr InferConcatOpShape( absl::Span arg_shapes, int64 dimension); - // Infers the shape produced by a kAfterAll. Trivially this shape is always a - // TOKEN shape. However, ShapeInference serves two purposes: inferring shapes - // and checking operand shapes. This method verifies that the operand shapes - // are all TOKENs. - static StatusOr InferAfterAllShape( - absl::Span arg_shapes); - // Helper that validates the given operand shape can be converted to the // target output_shape via a convert instruction -- the requirement is that // the shape is identical except for the element type. diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 96f3055c98e0611dfe25517cb490014a6d1f7c76..50d51eaeb762e208004c1dae3dcc27503f3f94e9 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -280,6 +280,13 @@ Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) { return Status::OK(); } +Status TuplePointsToAnalysis::HandleAddDependency( + HloInstruction* add_dependency) { + // AddDependency just forwards the value of its zero-th operand. + CreateCopiedPointsToSet(add_dependency, add_dependency->operand(0)); + return Status::OK(); +} + Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) { // RecvDone aliases its input (Recv) tuple element {0} to element {0} of its // output. The other indices ({} and {1}) define their own buffers. diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h index bcfcb388f95b0bedb35a8c399e804034816867b3..0a1d5649d6d69fea12263e6986ce76af62615ec7 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h @@ -252,6 +252,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault { Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; + Status HandleAddDependency(HloInstruction* add_dependency) override; string ToString() const; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index 10ef2d38fa21c3e93c270535bc99b2f76435337d..561762b5d424ed5f537665be9d67a81dc8bdd56e 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -264,6 +264,22 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) { UnorderedElementsAre(inner_tuple)); } +TEST_F(TuplePointsToAnalysisTest, AddDependency) { + auto builder = HloComputation::Builder(TestName()); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); + auto token = builder.AddInstruction(HloInstruction::CreateToken()); + auto add_dependency = builder.AddInstruction( + HloInstruction::CreateAddDependency(constant, token)); + BuildModuleAndRunAnalysis(builder.Build()); + + auto& points_to_set = points_to_analysis_->GetPointsToSet(add_dependency); + EXPECT_EQ(1, points_to_set.size()); + EXPECT_FALSE(points_to_set.IsAmbiguous()); + EXPECT_TRUE(points_to_set.IsDistinct()); + ExpectHasTopLevelBuffers(points_to_set.CreateFlattenedSet(), {constant}); +} + TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) { // Create a tuple which contains duplicate elements. auto builder = HloComputation::Builder(TestName()); diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index b7c28bfac7889b788645360366d1419eb80e64de..41011176ffa91e885bc58364d1fb19617d3518ad 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/tuple_util.h" #include "tensorflow/compiler/xla/service/while_loop_analysis.h" #include "tensorflow/compiler/xla/service/while_util.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -207,6 +208,37 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( continue; } + if (!hoist_size_inflating_ops_) { + // Check that hoisting the instruction doesn't cause a significant memory + // blow-up. LICM extends the live-range of the output of the hoisted + // instruction to be the entire while loop, which may be problematic on + // platforms where memory is limited. This can be especially harmful if + // the instruction has a significantly larger output than its input, e.g. + // kIota, kBroadcast or kConstant. + int64 input_size = 0, output_size = 0; + + for (auto* operand : instruction->operands()) { + ShapeUtil::ForEachSubshape( + operand->shape(), + [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) { + if (ShapeUtil::IsArray(subshape)) { + input_size += ShapeUtil::ByteSizeOfElements(subshape); + } + }); + } + ShapeUtil::ForEachSubshape( + instruction->shape(), + [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) { + if (ShapeUtil::IsArray(subshape)) { + output_size += ShapeUtil::ByteSizeOfElements(subshape); + } + }); + + if (output_size > input_size) { + continue; + } + } + auto is_invariant = [&](HloInstruction* op) { return hoisted_instructions.find(op) != hoisted_instructions.end() || unhoisted_invariant_instructions.count(op) || diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h index 3031899f71e0fd77f20448d9d7489798af01615c..bd6232dc0a988775a0490abbf6125daad8476295 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h @@ -34,8 +34,14 @@ class WhileLoopInvariantCodeMotion : public HloModulePass { // Setting `hoist_constants` to false can be help if LICM is run in the mid // level HLO pipeline because hoisting constants out of while loop bodies can // break optimizations like constant folding. - explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false) - : hoist_constants_(hoist_constants) {} + // Setting `hoist_size_inflating_ops` to false will forbid hoisting + // instructions where the size of the output(s) is larger than the size of the + // input(s). This is useful on platforms on which it's important to prevent + // blow-ups in memory size. + explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false, + bool hoist_size_inflating_ops = true) + : hoist_constants_(hoist_constants), + hoist_size_inflating_ops_(hoist_size_inflating_ops) {} ~WhileLoopInvariantCodeMotion() override = default; absl::string_view name() const override { @@ -49,6 +55,7 @@ class WhileLoopInvariantCodeMotion : public HloModulePass { HloInstruction* while_instr); bool hoist_constants_; + bool hoist_size_inflating_ops_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc index 046ccb2d3f29c2141ade5275d043875e3e278582..8e7c4bc8828552e197b41f874c070d496b85a382 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc @@ -570,5 +570,59 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) { EXPECT_FALSE(simplified_loop); } +const char* const kInflatingTestCase = R"( +HloModule ModuleWithWhile + +mul { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT mul = f32[] multiply(lhs, rhs) +} + +body { + p_body = (f32[]) parameter(0) + iota = f32[1024, 1024] iota(), iota_dimension=0 + add = f32[1024, 1024] add(iota, iota) + constant = f32[] constant(1.0) + reduce = f32[] reduce(f32[1024, 1024] add, f32[] constant), dimensions={0,1}, to_apply=mul + ROOT root = (f32[]) tuple(reduce) +} + +condition { + p_cond = (f32[]) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + param = f32[] parameter(0) + while_init = (f32[]) tuple(param) + ROOT while = (f32[]) while(while_init), condition=condition, body=body +} +)"; + +TEST_F(WhileLoopInvariantCodeMotionTest, HoistsInflatingByDefault) { + auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie(); + + TF_ASSERT_OK_AND_ASSIGN( + bool simplified_loop, + WhileLoopInvariantCodeMotion(/*hoist_constants=*/true).Run(m.get())); + EXPECT_TRUE(simplified_loop); + + HloComputation* while_body = m->GetComputationWithName("wide.body"); + ASSERT_NE(while_body, nullptr); + EXPECT_THAT(while_body->instructions(), Not(Contains(op::Iota()))); +} + +TEST_F(WhileLoopInvariantCodeMotionTest, NoHoistInflating) { + auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie(); + + TF_ASSERT_OK_AND_ASSIGN( + bool simplified_loop, + WhileLoopInvariantCodeMotion(/*hoist_constants=*/true, + /*hoist_size_inflating_ops=*/false) + .Run(m.get())); + EXPECT_FALSE(simplified_loop); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index 6f924a29d8a3ac60abe98efd2e82ae7343c7de47..c4790a7f199a90ca81e5503b4256bd95df88d4f4 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -19,13 +19,17 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/types/optional.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/service/while_loop_analysis.h" namespace xla { +namespace m = match; using absl::optional; using hlo_query::ContainsInstrWithOpcode; @@ -302,6 +306,147 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { return true; } +// Removes each loop parameter (i.e. member of the while loop tuple) that is a +// constant and is the same in the while loop body and the while loop init. +static StatusOr TryRemoveConstantParams(HloInstruction* while_op) { + HloModule* module = while_op->GetModule(); + HloComputation* computation = while_op->parent(); + auto* while_init = while_op->mutable_operand(0); + auto* while_body = while_op->while_body(); + auto* while_cond = while_op->while_condition(); + auto* while_body_root = while_body->root_instruction(); + if (while_init->opcode() != HloOpcode::kTuple || + while_body_root->opcode() != HloOpcode::kTuple) { + return false; + } + + TF_RET_CHECK(while_cond->num_parameters() == 1); + TF_RET_CHECK(while_body->num_parameters() == 1); + TF_RET_CHECK( + ShapeUtil::Compatible(while_init->shape(), while_body_root->shape())); + + absl::flat_hash_set constant_tuple_indices; + const auto& while_shape = while_init->shape(); + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + auto* init_elem = while_init->operand(i); + auto* body_elem = while_body_root->operand(i); + if (init_elem->opcode() == HloOpcode::kConstant && + body_elem->opcode() == HloOpcode::kConstant && + init_elem->literal() == body_elem->literal()) { + constant_tuple_indices.insert(i); + } + } + + if (constant_tuple_indices.empty()) { + return false; + } + + // OK, we found some constant elements of the while parameter! Eliminate + // them. + std::vector new_while_shape_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + if (!constant_tuple_indices.count(i)) { + new_while_shape_elems.push_back(while_shape.tuple_shapes(i)); + } + } + Shape new_while_shape = ShapeUtil::MakeTupleShape(new_while_shape_elems); + + // `new_instrs` holds instructions created outside of a computation for + // cloning. Elements added here just need to live until the end of the + // relevant CloneWithReplacement call. + std::vector> new_instrs; + auto add_new_instr = [&](std::unique_ptr instr) { + new_instrs.push_back(std::move(instr)); + return new_instrs.back().get(); + }; + + // Returns a new tuple without the elements of constant_tuple_indices. + auto remove_constant_elems = [&](HloInstruction* instr) { + CHECK(ShapeUtil::Compatible(instr->shape(), while_shape)); + + std::vector tuple_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + if (!constant_tuple_indices.count(i)) { + tuple_elems.push_back( + add_new_instr(HloInstruction::CreateGetTupleElement( + while_shape.tuple_shapes(i), instr, i))); + } + } + return HloInstruction::CreateTuple(tuple_elems); + }; + + auto add_constant_elems = [&](HloInstruction* instr) { + CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape)); + + std::vector tuple_elems; + int64 j = 0; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + if (constant_tuple_indices.count(i)) { + tuple_elems.push_back(while_init->mutable_operand(i)); + } else { + tuple_elems.push_back( + add_new_instr(HloInstruction::CreateGetTupleElement( + while_shape.tuple_shapes(i), instr, j))); + ++j; + } + } + return HloInstruction::CreateTuple(tuple_elems); + }; + + // Special case: constant_tuple_indices covers the whole while parameter, so + // the new while shape is the empty tuple. In this case, the value of the + // while loop is simply equal to the value of `init`. + // + // It's unfortunate to special-case this, but it's simpler than the + // alternative. The problem is that if our while parameter has no + // non-constant elems, the tuple returned by `add_constant_elems` won't depend + // on instr (the loop body/cond parameter), and therefore + // CloneWithReplacementPairs will *leave the parameter out entirely*, creating + // invalid HLO. + if (ShapeUtil::IsEmptyTuple(new_while_shape)) { + TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, while_init)); + return true; + } + + std::unique_ptr new_while_cond = + while_cond->CloneWithReplacementPairs({ + while_cond->parameter_instruction(0), + add_constant_elems(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_cond->parameter_instruction(0)->name()))), + }); + + std::unique_ptr new_while_body = + while_body->CloneWithReplacementPairs( + { + while_body->parameter_instruction(0), + add_constant_elems(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_cond->parameter_instruction(0)->name()))), + }, + { + while_body->root_instruction(), + remove_constant_elems( + add_new_instr(while_body->root_instruction()->Clone())), + }); + + // Create the final while loop, and add any new instructions created to + // `computation`. + new_instrs.clear(); + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + while_op, + add_constant_elems( + computation->AddInstruction(HloInstruction::CreateWhile( + new_while_shape, + module->AddEmbeddedComputation(std::move(new_while_cond)), + module->AddEmbeddedComputation(std::move(new_while_body)), + add_new_instr(remove_constant_elems(while_init))))))); + for (auto& instr : new_instrs) { + computation->AddInstruction(std::move(instr)); + } + return true; +} + // Tries to remove a while loop from the graph. // // - Loops with trip count of 0 can be replaced by the loop's "init" value. @@ -519,14 +664,6 @@ static StatusOr TryFlattenNestedTuples(HloInstruction* while_op) { return false; } - // Cowardly refuse to perform this optimization in the presence of kDomain - // instructions, which may reference other instructions in the loop and - // therefore make this complicated. - if (ContainsInstrWithOpcode(while_body, {HloOpcode::kDomain}) || - ContainsInstrWithOpcode(while_cond, {HloOpcode::kDomain})) { - return false; - } - std::vector flattened_shape_elems; ShapeUtil::ForEachSubshape(while_shape, [&](const Shape& s, const ShapeIndex& /*index*/) { @@ -605,6 +742,248 @@ static StatusOr TryFlattenNestedTuples(HloInstruction* while_op) { return true; } +// Tries to merge loop induction variables of a given type. +// +// In this pass we're only concerned with elements of the loop's tuple that +// are effective-scalars of type `elem_ty`. Some terminology: +// +// - The trip counter is the first element of the loop's tuple that starts at +// 0 and does x++ on each iteration. +// +// - An induction variable is an element of the loop's tuple that is not the +// trip counter and does `x += ` on each iteration of the loop. +// Negative constants are OK. +// +// This pass adds a trip counter if one isn't already present, then replaces +// each induction variable with +// +// + * . +// +// This reduces the number of scalar operations in the loop, which is important +// e.g. on GPUs, where each scalar operation is nontrivially expensive because +// it's a separate kernel launch. +// +// Returns the new loop if a change was made, or null if no change was made. +// Note that the new loop is not a valid replacement for the old loop; it may +// need to be wrapped in a tuple that changes its shape. We return the loop +// itself so that you can call TryMergeInductionVariables in a loop, once for +// each integral type elem_ty. +static StatusOr TryMergeInductionVariables( + HloInstruction* while_op, PrimitiveType elem_ty) { + CHECK(primitive_util::IsIntegralType(elem_ty)) << PrimitiveType_Name(elem_ty); + HloModule* module = while_op->GetModule(); + HloComputation* computation = while_op->parent(); + auto* while_init = while_op->mutable_operand(0); + auto* while_body = while_op->while_body(); + auto* while_cond = while_op->while_condition(); + auto* while_body_root = while_body->root_instruction(); + if (while_init->opcode() != HloOpcode::kTuple || + while_body_root->opcode() != HloOpcode::kTuple) { + return nullptr; + } + + TF_RET_CHECK(while_cond->num_parameters() == 1); + TF_RET_CHECK(while_body->num_parameters() == 1); + TF_RET_CHECK( + ShapeUtil::Compatible(while_init->shape(), while_body_root->shape())); + Shape while_shape = while_init->shape(); + + // The tuple index of the trip counter, if one is present. + absl::optional trip_counter; + // Maps the tuple index of each induction variable to its constant increment. + absl::flat_hash_map induction_vars; + for (int64 i = 0; i < while_body_root->operand_count(); ++i) { + const auto& elem_shape = while_body_root->operand(i)->shape(); + if (!ShapeUtil::IsEffectiveScalar(elem_shape) || + elem_shape.element_type() != elem_ty) { + continue; + } + + HloInstruction* constant; + if (!Match(while_body_root->mutable_operand(i), + m::AddAnyOrder(m::GetTupleElement(m::Parameter(), i), + m::Constant(&constant)))) { + continue; + } + if (!trip_counter && constant->literal().IsAll(1) && + while_init->operand(i)->IsConstant() && + while_init->operand(i)->literal().IsAll(0)) { + VLOG(10) << "Found existing trip counter at index " << i; + trip_counter = i; + } else { + VLOG(10) << "Found induction variable at index " << i; + induction_vars.emplace(i, Cast(constant)); + } + } + + // There's only something to simplify if we can either: + // + // - combine one or more induction vars with an existing trip counter, or + // - replace two or more induction variables with a new trip counter. + // + // Put another way, there's only something to simplify if the number of + // induction vars plus the number of existing trip counters (0 or 1) is >= 2. + if (induction_vars.size() + (trip_counter.has_value() ? 1 : 0) < 2) { + return nullptr; + } + + // OK, we're going to do the transformation! Set up some helpers. + + // `new_instrs` holds instructions created outside of a computation for + // cloning. Elements added here just need to live until the end of the + // relevant CloneWithReplacement call. + std::vector> new_instrs; + auto add_new_instr = [&](std::unique_ptr instr) { + new_instrs.push_back(std::move(instr)); + return new_instrs.back().get(); + }; + + auto add_binary_op = [&](const Shape& shape, HloOpcode opcode, + HloInstruction* lhs, HloInstruction* rhs) { + // Reshape lhs/rhs to the output shape if necessary. This deals with the + // fact that induction variables need only be effective scalars, not true + // scalars. + if (!ShapeUtil::Compatible(shape, lhs->shape())) { + lhs = add_new_instr(HloInstruction::CreateReshape(shape, lhs)); + } + if (!ShapeUtil::Compatible(shape, rhs->shape())) { + rhs = add_new_instr(HloInstruction::CreateReshape(shape, rhs)); + } + return add_new_instr(HloInstruction::CreateBinary(shape, opcode, lhs, rhs)); + }; + + auto add_gte = [&](HloInstruction* src, int64 idx) { + return add_new_instr(HloInstruction::CreateGetTupleElement( + src->shape().tuple_shapes(idx), src, idx)); + }; + + // Our new while loop will have the same shape as the old while loop, except + // we'll add a trip counter to the end if it wasn't originally present. + Shape new_while_shape = while_shape; + bool added_trip_counter = false; + if (!trip_counter) { + VLOG(10) << "Adding new trip counter to end of loop's tuple."; + trip_counter = new_while_shape.tuple_shapes_size(); + *new_while_shape.add_tuple_shapes() = + ShapeUtil::MakeShape(elem_ty, /*dimensions=*/{}); + added_trip_counter = true; + } + + // Converts `instr` into a tuple of the "old" form -- that is, to a tuple with + // shape `while_body->shape()` and where the induction variables are "reified" + // (i.e. they have value + * ). + auto convert_to_old_form = [&](HloInstruction* instr) { + CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape)); + std::vector tuple_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + const auto& elem_shape = while_shape.tuple_shapes(i); + if (!induction_vars.count(i)) { + tuple_elems.push_back(add_gte(instr, i)); + continue; + } + tuple_elems.push_back(add_binary_op( + elem_shape, HloOpcode::kAdd, add_gte(instr, i), + add_binary_op(elem_shape, HloOpcode::kMultiply, + add_gte(instr, *trip_counter), + add_new_instr(induction_vars.at(i)->Clone())))); + } + return HloInstruction::CreateTuple(tuple_elems); + }; + + // Converts `root` into a tuple of the "new" form -- that is, to a tuple with + // shape `new_while_shape` and where the induction variables (but not trip + // counters) are replaced with their unchanging values. + auto convert_to_new_form = [&](HloInstruction* old_root, + HloParameterInstruction* loop_body_param) { + CHECK(ShapeUtil::Compatible(old_root->shape(), while_shape)); + std::vector tuple_elems; + + // In the new form, induction variables come from `init`, everything else + // (including the trip counter if it's not one we created ourselves) comes + // from the `root` tuple unmodified. + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + tuple_elems.push_back( + add_gte((induction_vars.count(i) ? loop_body_param : old_root), i)); + } + // If we created a trip counter ourselves, add 1 to it in the next + // iteration. + if (added_trip_counter) { + tuple_elems.push_back(add_binary_op( + new_while_shape.tuple_shapes(*trip_counter), HloOpcode::kAdd, + add_gte(loop_body_param, *trip_counter), + add_new_instr( + HloInstruction::CreateConstant(LiteralUtil::One(elem_ty))))); + } + + return HloInstruction::CreateTuple(tuple_elems); + }; + + // Creates a new init tuple, which is the same as the old init tuple except if + // we added a trip counter, it's set to 0. + auto get_new_while_init = [&](HloInstruction* init) { + CHECK(ShapeUtil::Compatible(init->shape(), while_shape)); + if (!added_trip_counter) { + return init; + } + std::vector tuple_elems; + for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) { + tuple_elems.push_back(add_gte(init, i)); + } + tuple_elems.push_back(add_new_instr( + HloInstruction::CreateConstant(LiteralUtil::Zero(elem_ty)))); + return add_new_instr(HloInstruction::CreateTuple(tuple_elems)); + }; + + std::unique_ptr new_while_cond = + while_cond->CloneWithReplacementPairs({ + while_cond->parameter_instruction(0), + convert_to_old_form(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_cond->parameter_instruction(0)->name()))), + }); + + // Creating the new while body proceeds in two steps. First we convert the + // users of the parameter to the old form. Then as a second + // CloneWithReplacement operation we convert the root to the new form. We + // have to do this in two steps because the new root needs to use the new + // param0, and during the first clone operation, only the *old-form* param0 is + // accessible. + // + // We have to add temp_new_while_body to the module because cloning a + // computation touches the module (to get its NameUniquer). + HloComputation* temp_new_while_body = + module->AddEmbeddedComputation(while_body->CloneWithReplacementPairs({ + while_body->parameter_instruction(0), + convert_to_old_form(add_new_instr(HloInstruction::CreateParameter( + 0, new_while_shape, + while_body->parameter_instruction(0)->name()))), + })); + std::unique_ptr new_while_body = + temp_new_while_body->CloneWithReplacementPairs({ + temp_new_while_body->root_instruction(), + convert_to_new_form( + add_new_instr(temp_new_while_body->root_instruction()->Clone()), + Cast( + temp_new_while_body->parameter_instruction(0))), + }); + TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(temp_new_while_body)); + + // Create the final while loop, and add any new instructions created to + // `computation`. + new_instrs.clear(); + auto* new_while = computation->AddInstruction(HloInstruction::CreateWhile( + new_while_shape, + module->AddEmbeddedComputation(std::move(new_while_cond)), + module->AddEmbeddedComputation(std::move(new_while_body)), + get_new_while_init(while_init))); + TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction( + while_op, convert_to_old_form(new_while))); + for (auto& instr : new_instrs) { + computation->AddInstruction(std::move(instr)); + } + return new_while; +} + StatusOr WhileLoopSimplifier::Run(HloModule* module) { XLA_VLOG_LINES(3, "WhileLoopSimplifier::Run(), before:\n" + module->ToString()); @@ -650,19 +1029,50 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { continue; } + // TODO(b/119281462): Cowardly refuse to perform any of the following + // optimizations in the presence of kDomain instructions. It seems that + // modifying a while loop's tuple doesn't work when kDomain is present. + if (ContainsInstrWithOpcode(while_op->while_body(), {HloOpcode::kDomain}) || + ContainsInstrWithOpcode(while_op->while_condition(), + {HloOpcode::kDomain})) { + continue; + } + + // Each of the optimizations below modifies the while loop itself if it's + // successful, meaning that `while_op` is no longer valid after one of these + // transformations returns true. + TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op)); changed |= result; if (result) { - // Successfully flattening nested tuples results in us cloning and - // replacing the while loop, meaning that `while_op` is no longer valid. continue; } TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op)); changed |= result; if (result) { - // Successfully removing dead while params results in us cloning and - // replacing the while loop, meaning that `while_op` is no longer valid. + continue; + } + + TF_ASSIGN_OR_RETURN(result, TryRemoveConstantParams(while_op)); + changed |= result; + if (result) { + continue; + } + + bool merged_induction_vars = false; + // Notably missing from this list are S16 and U16. These don't currently + // work because S/U16 literals are not implemented. + for (auto elem_ty : {S8, U8, S32, U32, S64, U64}) { + TF_ASSIGN_OR_RETURN(auto* new_while_op, + TryMergeInductionVariables(while_op, elem_ty)); + if (new_while_op) { + while_op = new_while_op; + changed = true; + merged_induction_vars = true; + } + } + if (merged_induction_vars) { continue; } } diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index 05005e0b262a50cd40e004deac4c450a2e257308..4950e8269e9cf0723d717bd1734518d104c0c9f2 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -17,9 +17,12 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_replace.h" +#include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -27,8 +30,17 @@ limitations under the License. namespace xla { namespace { +using ::testing::_; namespace op = xla::testing::opcode_matchers; +// Returns the first kWhile instruction within m's entry computation. +HloInstruction* FindFirstWhile(HloModule* m) { + const auto& instrs = m->entry_computation()->instructions(); + return *absl::c_find_if(instrs, [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }); +} + class WhileLoopSimplifierTest : public HloTestBase { protected: // Makes an HloModule that contains a loop with `num_iters` iteration. @@ -540,11 +552,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) { // it easy to find. EXPECT_TRUE(HloDCE().Run(m.get()).ok()); - const auto& instrs = m->entry_computation()->instructions(); - HloInstruction* new_while = - *absl::c_find_if(instrs, [](const HloInstruction* instr) { - return instr->opcode() == HloOpcode::kWhile; - }); + HloInstruction* new_while = FindFirstWhile(m.get()); Shape flat_tuple = ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])") .ValueOrDie(); @@ -563,5 +571,177 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) { .ValueOrDie())); } +// Edge-case: All elements of the loop carry are constants which can be removed, +// leaving us with a nullary loop. This is a special case, we just replace the +// loop with its init. +TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarry) { + const string hlo_string = R"( + HloModule Test + Body { + param = (s32[1]) parameter(0) + a = s32[1] constant({0}) + ROOT tuple = (s32[1]) tuple(a) + } + Cond { + param = (s32[1]) parameter(0) + ROOT cond = pred[] constant(true) + } + ENTRY Loop { + a = s32[1] constant({0}) + init = (s32[1]) tuple(a) + ROOT while = (s32[1]) while(init), condition=Cond, body=Body + })"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + op::Tuple(op::Constant())); +} + +TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) { + const string hlo_string = R"( + HloModule Test + Body { + param = (s32[1], s32[2], s32[3]) parameter(0) + a = s32[1] get-tuple-element(param), index=0 + a.1 = s32[1] add(a, a) + b = s32[2] constant({1,1}) + c = s32[3] constant({10,10,10}) + ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a.1, b, c) + } + Cond { + param = (s32[1], s32[2], s32[3]) parameter(0) + /* Use each tuple element. The verifier will then ensure that if any of + * these get modified, they're replaced with values of the correct shape. */ + a = s32[1] get-tuple-element(param), index=0 + b = s32[2] get-tuple-element(param), index=1 + c = s32[3] get-tuple-element(param), index=2 + ROOT cond = pred[] constant(true) + } + ENTRY Loop { + /* Only `b` should be simplified away. `a` is not a constant within the + * loop, and `c`'s value changes depending on whether we run 0 or 1 + * iterations of the loop. */ + a = s32[1] constant({0}) + b = s32[2] constant({1,1}) + c = s32[3] constant({2,2,2}) + init = (s32[1], s32[2], s32[3]) tuple(a,b,c) + ROOT while = (s32[1], s32[2], s32[3]) while(init), + condition=Cond, body=Body + })"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + // DCE away the old loop so there's just one while loop in the module, making + // it easy to find. + EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + // Run the tuple simplifier to make the resulting HLO a bit easier to check. + EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok()); + + HloInstruction* new_while = FindFirstWhile(m.get()); + Shape new_while_shape = + ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie(); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + m->entry_computation()->root_instruction()->shape(), + ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie())); + EXPECT_THAT(m->entry_computation()->root_instruction(), + op::Tuple(_, op::Constant(), _)); +} + +const char* const kSimpleMergeInductionVariablesModule = R"( + HloModule Test + Body { + param = (TYPE[], TYPE[], TYPE[]) parameter(0) + + a = TYPE[] get-tuple-element(param), index=0 + one = TYPE[] constant(1) + a1 = TYPE[] add(a, one) + + b = TYPE[] get-tuple-element(param), index=1 + negone = TYPE[] constant(-1) + b1 = TYPE[] add(b, negone) + + c = TYPE[] add(a, b) + + ROOT tuple = (TYPE[], TYPE[], TYPE[]) tuple(a1,b1,c) + } + Cond { + param = (TYPE[], TYPE[], TYPE[]) parameter(0) + a = TYPE[] get-tuple-element(param), index=0 + b = TYPE[] get-tuple-element(param), index=1 + sum = TYPE[] power(a, b) + ten = TYPE[] constant(10) + ROOT cond = pred[] less-than(sum, ten) + } + ENTRY Loop { + a = TYPE[] constant(10) + b = TYPE[] constant(100) + c = TYPE[] constant(0) + init = (TYPE[], TYPE[], TYPE[]) tuple(a,b,c) + while = (TYPE[], TYPE[], TYPE[]) while(init), condition=Cond, body=Body + + a1 = TYPE[] get-tuple-element(while), index=0 + b1 = TYPE[] get-tuple-element(while), index=1 + ROOT sum = TYPE[] add(a1, b1) + })"; + +TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) { + string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule, + {{"TYPE", "s32"}}); + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + // DCE away the old loop so there's just one while loop in the module, making + // it easy to find, and run the tuple simplifier to make the resulting HLO + // easier to check. + EXPECT_TRUE(HloDCE().Run(m.get()).ok()); + EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok()); + + HloInstruction* new_while = FindFirstWhile(m.get()); + // We should have added a new loop counter for s32[] to the end of the tuple. + SCOPED_TRACE(m->ToString()); + Shape new_while_shape = + ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie(); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + new_while_shape)); + + EXPECT_THAT(new_while->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(), 0), + op::GetTupleElement(op::Parameter(), 1), op::Add(), + op::Add(op::GetTupleElement(op::Parameter(), 3), + op::Constant()))); + EXPECT_THAT(new_while->while_condition()->root_instruction(), + op::Lt(op::Power(op::Add(), op::Add()), op::Constant())); +} + +// We shouldn't merge S16 induction variables; we can't create constants of this +// type because S16 literals are not implemented. +TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) { + string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule, + {{"TYPE", "s16"}}); + EXPECT_FALSE( + WhileLoopSimplifier() + .Run(ParseAndReturnVerifiedModule(hlo_string).ValueOrDie().get()) + .ValueOrDie()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc new file mode 100644 index 0000000000000000000000000000000000000000..d209389c745df2a90ae0cf0a506bffaa1e07366a --- /dev/null +++ b/tensorflow/compiler/xla/shape.cc @@ -0,0 +1,62 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/shape.h" + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { + +ProgramShape::ProgramShape(const ProgramShapeProto& program_shape_proto) { + for (const Shape& shape : program_shape_proto.parameters()) { + *add_parameters() = shape; + } + *mutable_result() = program_shape_proto.result(); + for (const string& name : program_shape_proto.parameter_names()) { + add_parameter_names(name); + } +} + +ProgramShapeProto ProgramShape::ToProto() const { + ProgramShapeProto proto; + for (const Shape& shape : parameters()) { + *proto.add_parameters() = shape; + } + *proto.mutable_result() = result(); + for (const string& name : parameter_names()) { + proto.add_parameter_names(name); + } + return proto; +} + +string ProgramShape::ToString() const { + std::vector parameter_strings(parameters_size()); + for (int i = 0; i < parameters_size(); ++i) { + parameter_strings[i] = absl::StrCat( + i < parameter_names_size() ? parameter_names(i) : "(unknown)", ": ", + ShapeUtil::HumanString(parameters(i))); + } + return absl::StrCat("(", absl::StrJoin(parameter_strings, ", "), ") -> ", + ShapeUtil::HumanString(result())); +} + +std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape) { + out << program_shape.ToString() << "\n"; + return out; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h new file mode 100644 index 0000000000000000000000000000000000000000..c3aecb1736847eaa1d99f33c7d9cf81d9e7b3974 --- /dev/null +++ b/tensorflow/compiler/xla/shape.h @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SHAPE_H_ +#define TENSORFLOW_COMPILER_XLA_SHAPE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// Shape of the parameters and output of an XLA computation. This is analogous +// to a traditional function signature. +class ProgramShape { + public: + ProgramShape() = default; + + // Creates a ProgramShape from a ProgramShapeProto protobuf. + explicit ProgramShape(const ProgramShapeProto& program_shape_proto); + + // Returns a proto representation of the object. + ProgramShapeProto ToProto() const; + + string ToString() const; + + // The following methods mirror the protobuf generated code interface for the + // message ProgramShapeProto. This enabled easy migration of this data + // structure from a proto to a proper C++ class. + // TODO(b/29771030): Replace or augment these methods with a more ergonomic + // interface. + + // Methods for accessing and manipulating the Shape of the parameters. + int parameters_size() const { return parameters_.size(); } + const Shape& parameters(int index) const { return parameters_.at(index); } + Shape* mutable_parameters(int index) { return ¶meters_.at(index); } + Shape* add_parameters() { + parameters_.emplace_back(); + return ¶meters_.back(); + } + void clear_parameters() { parameters_.clear(); } + const std::vector& parameters() const { return parameters_; } + std::vector* mutable_parameters() { return ¶meters_; } + + // Methods for accessing and manipulating the Shape of the result. + const Shape& result() const { return result_; } + Shape* mutable_result() { return &result_; } + void clear_result() { result_.Clear(); } + + // Methods for accessing and manipulating the names of the parameters. + int parameter_names_size() const { return parameter_names_.size(); } + const string& parameter_names(int index) const { + return parameter_names_.at(index); + } + void set_parameter_names(int index, const string& value) { + parameter_names_.at(index) = value; + } + string* mutable_parameter_names(int index) { + return ¶meter_names_.at(index); + } + void add_parameter_names(const string& value) { + parameter_names_.push_back(value); + } + string* add_parameter_names() { + parameter_names_.push_back(""); + return ¶meter_names_.back(); + } + void clear_parameter_names() { parameter_names_.clear(); } + const std::vector& parameter_names() const { + return parameter_names_; + } + std::vector* mutable_parameter_names() { return ¶meter_names_; } + + string ShortDebugString() const { return ToProto().ShortDebugString(); } + string DebugString() const { return ToProto().DebugString(); } + + private: + // The shapes of the parameters of the computation represented by this object. + std::vector parameters_; + + // The names of the parameters of the computation represented by this object. + std::vector parameter_names_; + + // The shape of the result of the computation represented by this object. + Shape result_; +}; + +std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SHAPE_H_ diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc3a5eb1d641134bbe6992e44e11032ac20fca67 --- /dev/null +++ b/tensorflow/compiler/xla/shape_test.cc @@ -0,0 +1,112 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/shape.h" + +#include +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace { + +TEST(ShapeTest, ProgramShapeToFromProto) { + ProgramShape program_shape; + *program_shape.add_parameters() = ShapeUtil::MakeShape(F32, {1, 2, 3}); + *program_shape.add_parameters() = ShapeUtil::MakeTokenShape(); + *program_shape.add_parameters() = ShapeUtil::MakeShape(S64, {}); + *program_shape.add_parameters() = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(S32, {}), + ShapeUtil::MakeTupleShape({ShapeUtil::MakeTokenShape()}), + ShapeUtil::MakeShape(F32, {42, 42})}); + + *program_shape.mutable_result() = ShapeUtil::MakeShape(F32, {7}); + + program_shape.add_parameter_names("foo"); + program_shape.add_parameter_names("bar"); + program_shape.add_parameter_names("baz"); + program_shape.add_parameter_names("qux qux"); + + // Create a copy of the program shape by round-tripping through a proto. + ProgramShape program_shape_copy(program_shape.ToProto()); + ASSERT_EQ(program_shape.parameters_size(), + program_shape_copy.parameters_size()); + for (int i = 0; i < program_shape.parameters_size(); ++i) { + EXPECT_TRUE(ShapeUtil::Equal(program_shape.parameters(i), + program_shape_copy.parameters(i))); + } + + EXPECT_TRUE( + ShapeUtil::Equal(program_shape.result(), program_shape_copy.result())); + + ASSERT_EQ(program_shape.parameter_names_size(), + program_shape_copy.parameter_names_size()); + for (int i = 0; i < program_shape.parameter_names_size(); ++i) { + EXPECT_EQ(program_shape.parameter_names(i), + program_shape_copy.parameter_names(i)); + } +} + +TEST(ShapeTest, ProgramShapeToString) { + Shape opaque = ShapeUtil::MakeOpaqueShape(); + Shape token = ShapeUtil::MakeTokenShape(); + Shape scalar = ShapeUtil::MakeShape(F32, {}); + Shape matrix = ShapeUtil::MakeShape(U32, {1, 2}); + Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1}); + Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, matrix, matrix2}); + Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token}); + + ProgramShape prog = ShapeUtil::MakeProgramShape( + {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple); + EXPECT_EQ( + "((unknown): opaque[], " + "(unknown): f32[], " + "(unknown): u32[1,2], " + "(unknown): s32[3,4], " + "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), " + "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", + ShapeUtil::HumanString(prog)); + + prog.add_parameter_names("arg0"); + prog.add_parameter_names("scalar"); + prog.add_parameter_names("matrix"); + prog.add_parameter_names("matrix2"); + prog.add_parameter_names("tuple"); + prog.add_parameter_names("nested_tuple"); + EXPECT_EQ( + "(arg0: opaque[], " + "scalar: f32[], " + "matrix: u32[1,2], " + "matrix2: s32[3,4], " + "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), " + "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], " + "token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", + ShapeUtil::HumanString(prog)); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h index df610102b4c7fa08c0b7030124939009130f89f4..7bf97729165bef98fabc29040e02203eee68a53c 100644 --- a/tensorflow/compiler/xla/shape_tree.h +++ b/tensorflow/compiler/xla/shape_tree.h @@ -667,12 +667,11 @@ void ShapeTree::CopySubtreeFrom(const ShapeTree& other, template bool ShapeTree::operator==(const ShapeTree& other) const { bool equal = true; - ForEachElement( - [this, &other, &equal](const ShapeIndex& index, const T& data) { - if (data != other.element(index)) { - equal = false; - } - }); + ForEachElement([&other, &equal](const ShapeIndex& index, const T& data) { + if (data != other.element(index)) { + equal = false; + } + }); return equal; } diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index c8ff55e7845785d9292516b823fb591cc28cbfad..2b6c484bc4f205be0180403eeac2dd391029b110 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -52,10 +52,10 @@ class ShapeTreeTest : public ::testing::Test { TEST_F(ShapeTreeTest, DefaultConstructor) { ShapeTree int_tree; - EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(int_tree.shape())); ShapeTree bool_tree; - EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape())); + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(bool_tree.shape())); } void ShapeTreeTest::TestShapeConstructor(const Shape& shape, diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index d0c35d8dee46a1e0a5e343e0506a14ca1ce38bfd..b05ec209cc6e0673632c6cba6742eb8d1f1d9067 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -372,10 +372,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return IsTuple(shape) && TupleElementCount(shape) == 0; } -/* static */ bool ShapeUtil::IsNil(const Shape& shape) { - return IsEmptyTuple(shape); -} - /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) { CHECK(IsTuple(shape)) << HumanString(shape); return shape.tuple_shapes_size(); @@ -567,6 +563,20 @@ StatusOr StringToPrimitiveType(const string& name) { HumanString(program_shape.result())); } +/* static */ string ShapeUtil::HumanString( + const ProgramShapeProto& program_shape_proto) { + std::vector parameters; + for (auto& shape : program_shape_proto.parameters()) { + const int i = parameters.size(); + parameters.push_back(StrCat(i < program_shape_proto.parameter_names_size() + ? program_shape_proto.parameter_names(i) + : "(unknown)", + ": ", HumanString(shape))); + } + return StrCat("(", absl::StrJoin(parameters, ", "), ") -> ", + HumanString(program_shape_proto.result())); +} + namespace { // Parses shapes with simple recursive descent structure -- consumes from the // front of s and passes that view recursively as required. diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index a7a3026cf3f3a53d34d389212738ca584a19db1d..3796c5be5d7c3915ee0aed29d1319ed3efa3869f 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -37,6 +38,7 @@ limitations under the License. #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -100,6 +102,11 @@ class ShapeIndex { string ToString() const; + template + friend H AbslHashValue(H h, const ShapeIndex& index) { + return H::combine(std::move(h), index.indices_); + } + private: container_type indices_; }; @@ -233,6 +240,7 @@ class ShapeUtil { // // (param_name: f32[42x12], ...) -> f32[24x42] static string HumanString(const ProgramShape& program_shape); + static string HumanString(const ProgramShapeProto& program_shape_proto); // Parses a ShapeUtil::HumanString-format shape string back into a shape // object. @@ -468,9 +476,6 @@ class ShapeUtil { // Returns true if shape is an empty tuple. static bool IsEmptyTuple(const Shape& shape); - // Returns true if shape is the nil shape (an empty tuple). - static bool IsNil(const Shape& shape); - // Returns the number of elements in the given tuple shape. // Precondition: IsTuple(shape) static int64 TupleElementCount(const Shape& shape); @@ -754,10 +759,18 @@ class ShapeUtil { pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads); } + tensorflow::mutex mu; + Status status; // Guarded by mu + while (n < rank) { if (pool != absl::nullopt) { - pool->Schedule( - [indexes, &visitor_function] { visitor_function(indexes); }); + pool->Schedule([indexes, &visitor_function, &mu, &status] { + StatusOr result = visitor_function(indexes); + if (!result.ok()) { + tensorflow::mutex_lock lock(mu); + status = status.ok() ? result.status() : status; + } + }); } else { TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes)); if (!should_continue) { @@ -775,7 +788,9 @@ class ShapeUtil { } } - return Status::OK(); + // Waits for the scheduled work to complete. + pool.reset(); + return status; } TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil); diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc index 0c647369a37e70f93abe1732963d2ddc7730c214..ce6330a0dc357194235914d5ad448ef2bfda7751 100644 --- a/tensorflow/compiler/xla/shape_util_test.cc +++ b/tensorflow/compiler/xla/shape_util_test.cc @@ -376,12 +376,12 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) { } TEST(ShapeUtilTest, NilShape) { - EXPECT_TRUE(ShapeUtil::IsNil(ShapeUtil::MakeNil())); - EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {1, 2, 3}))); - EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {0, 1}))); - EXPECT_FALSE(ShapeUtil::IsNil( + EXPECT_TRUE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeNil())); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {1, 2, 3}))); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {0, 1}))); + EXPECT_FALSE(ShapeUtil::IsEmptyTuple( ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {})}))); - EXPECT_FALSE(ShapeUtil::IsNil( + EXPECT_FALSE(ShapeUtil::IsEmptyTuple( ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {0})}))); } @@ -575,37 +575,6 @@ TEST(ShapeUtilTest, HumanString) { "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, " "token[])", ShapeUtil::HumanStringWithLayout(nested_tuple)); - - ProgramShape prog = ShapeUtil::MakeProgramShape( - {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple); - EXPECT_EQ( - "((unknown): opaque[], " - "(unknown): f32[], " - "(unknown): u32[1,2], " - "(unknown): s32[3,4], " - "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), " - "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) " - "-> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", - ShapeUtil::HumanString(prog)); - - prog.add_parameter_names("arg0"); - prog.add_parameter_names("scalar"); - prog.add_parameter_names("matrix"); - prog.add_parameter_names("matrix2"); - prog.add_parameter_names("tuple"); - prog.add_parameter_names("nested_tuple"); - EXPECT_EQ( - "(arg0: opaque[], " - "scalar: f32[], " - "matrix: u32[1,2], " - "matrix2: s32[3,4], " - "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), " - "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], " - "token[])) " - "-> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", - ShapeUtil::HumanString(prog)); } TEST(ShapeUtilTest, ForEachSubshapeArray) { diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index db34d34f969311543d988ec6c3b8ee2af5b07e8e..0105e4a22650ece7bdfc35ff6dea340b5da78caf 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -135,6 +135,7 @@ cc_library( "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", @@ -297,6 +298,31 @@ xla_test( ], ) +xla_test( + name = "conv_depthwise_test", + timeout = "long", + srcs = ["conv_depthwise_test.cc"], + blacklisted_backends = [ + # disabled because of a break b/119590850. + "cpu", + "gpu", + ], + shard_count = 50, + deps = [ + "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/service:bfloat16_normalization", + "//tensorflow/compiler/xla/service:despecializer", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "@com_google_absl//absl/types:optional", + ], +) + xla_test( name = "check_execution_arity_test", srcs = ["check_execution_arity_test.cc"], @@ -1265,6 +1291,7 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc index 9811a015e91d866d6f4de6ebb6dac536ed6c7e06..4f5b525a34252db9e967a55af0d1bf39a2dd830e 100644 --- a/tensorflow/compiler/xla/tests/concat_test.cc +++ b/tensorflow/compiler/xla/tests/concat_test.cc @@ -492,6 +492,32 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) { ComputeAndCompareR3(&builder, expected, {p0.get(), p1.get()}); } +XLA_TEST_F(ConcatTest, ConcatDeeplyNested) { + XlaBuilder builder(TestName()); + auto a_literal = LiteralUtil::CreateR1({256.0}); + auto a = Parameter(&builder, 0, a_literal.shape(), "x"); + auto b = ConcatInDim(&builder, {a, a}, 0); + auto c = ConcatInDim(&builder, {b, b}, 0); + auto d = ConcatInDim(&builder, {c, c}, 0); + auto e = ConcatInDim(&builder, {d, d}, 0); + auto f = ConcatInDim(&builder, {e, e}, 0); + auto g = ConcatInDim(&builder, {f, f}, 0); + auto h = ConcatInDim(&builder, {g, g}, 0); + auto i = ConcatInDim(&builder, {h, h}, 0); + auto j = ConcatInDim(&builder, {i, i}, 0); + auto k = ConcatInDim(&builder, {j, j}, 0); + auto l = ConcatInDim(&builder, {k, k}, 0); + auto m = ConcatInDim(&builder, {l, l}, 0); + auto n = ConcatInDim(&builder, {m, m}, 0); + auto o = ConcatInDim(&builder, {n, n}, 0); + auto p = ConcatInDim(&builder, {o, o}, 0); + auto q = ConcatInDim(&builder, {p, p}, 0); + ConcatInDim(&builder, {q, q}, 0); + std::vector expected(131072, 256.0); + auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie(); + ComputeAndCompareR1(&builder, expected, {a_data.get()}); +} + // Describes a binary rank-2 concatenation test. struct R2BinarySpec { int64 lhs_dim0; diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..60ce576ceb20b89b59e72d821e63b0ccdee51b0b --- /dev/null +++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc @@ -0,0 +1,234 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/bfloat16_normalization.h" +#include "tensorflow/compiler/xla/service/despecializer.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +string GetFloatDataType(bool use_bfloat16) { + return use_bfloat16 ? "bf16" : "f32"; +} + +struct DepthwiseConvolution2DSpec { + int64 output_feature, window, stride, pad, lhs_dilate; + std::vector activation_dims; + std::vector activation_layout; + std::vector kernel_dims; + std::vector kernel_layout; + std::vector output_dims; + std::vector output_layout; +}; + +class DepthwiseConvolution2DTest + : public HloTestBase, + public ::testing::WithParamInterface< + ::testing::tuple> {}; + +static std::vector GetConv2DTestCases() { + std::vector config_set; + std::vector> config_options = { + {128, 6, 3, 64}, {256, 5, 3, 256}, {256, 5, 2, 144}, {144, 5, 3, 64}, + {144, 5, 2, 256}, {8, 48, 17, 8}, {128, 20, 6, 64}, {128, 1, 2, 144}, + {256, 1, 2, 64}, {64, 14, 12, 172}, {16, 9, 4, 16}}; + + for (auto option : config_options) { + int64 feature = option[0]; + int64 activation_size = option[1]; + int64 kernel_size = option[2]; + int64 batch = option[3]; + + std::vector kernel_layout = {3, 2, 1, 0}; + DepthwiseConvolution2DSpec config; + config.output_feature = feature; + config.window = kernel_size; + + config.activation_dims = {batch, activation_size, activation_size, feature}; + config.activation_layout = {3, 0, 2, 1}; + + config.kernel_dims = {kernel_size, kernel_size, 1, feature}; + config.kernel_layout = {3, 2, 1, 0}; + + if (activation_size == 1 && kernel_size == 2) { + // Test for outer dim. + config.output_dims = {batch, activation_size + kernel_size - 1, + activation_size + kernel_size, feature}; + } else if (feature == 256) { + // Restrict dilation-based tests only to one feature configuration. + config.stride = activation_size - 1; + config.pad = 0; + config.lhs_dilate = feature / 32; + config.output_dims = {batch, feature / 32, + activation_size - kernel_size + 1, feature}; + } else { + config.stride = config.pad = config.lhs_dilate = -1; + config.output_dims = {batch, activation_size - kernel_size + 1, + activation_size - kernel_size + 1, feature}; + } + + // Try this layout for all kernel shapes. + config.output_layout = {3, 0, 2, 1}; + config_set.push_back(config); + + // Try other layouts only for certain kernel shapes. + if (kernel_size % 2 == 0) { + config.activation_layout = {0, 3, 2, 1}; + config_set.push_back(config); + + config.output_layout = {0, 3, 2, 1}; + config_set.push_back(config); + + config.activation_layout = {3, 0, 2, 1}; + config_set.push_back(config); + } + } + + return config_set; +} + +string DepthwiseConvolution2DTestDataToString( + const ::testing::TestParamInfo< + ::testing::tuple>& data) { + const auto& spec = ::testing::get<0>(data.param); + const string data_type = GetFloatDataType(::testing::get<1>(data.param)); + string str = absl::StrCat( + "activation_dims_", absl::StrJoin(spec.activation_dims, "x"), + "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"), + "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_", + absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_", + absl::StrJoin(spec.output_dims, "x"), "_output_layout_", + absl::StrJoin(spec.output_layout, "_"), data_type); + // -1 indicates non-existence. + if (spec.stride != -1) { + absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1"); + } + + // Test names are not allowed to contain the '-' character. + absl::c_replace(str, '-', 'n'); + return str; +} + +string BuildHloTextDepthwiseConvolution2D( + const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) { + const string data_type = GetFloatDataType(use_bfloat16); + if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv, is_scheduled=true + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.window, spec.window, spec.window, spec.output_feature); + + } else if (spec.stride == -1) { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv, is_scheduled=true + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d}, dim_labels=b01f_01io->b01f, + feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.output_feature); + } else { + return absl::StrFormat( + R"( + HloModule TensorFlowDepthwiseConv, is_scheduled=true + + ENTRY main { + activation = %s[%s]{%s} parameter(0) + kernel = %s[%s]{%s} parameter(1) + ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel), + window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, + dim_labels=b01f_01io->b01f, feature_group_count=%d + } + )", + data_type, absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), data_type, + absl::StrJoin(spec.output_dims, ","), + absl::StrJoin(spec.output_layout, ","), data_type, + absl::StrJoin(spec.activation_dims, ","), + absl::StrJoin(spec.activation_layout, ","), data_type, + absl::StrJoin(spec.kernel_dims, ","), + absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window, + spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature); + } +} + +XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) { + const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam()); + bool use_bfloat16 = ::testing::get<1>(GetParam()); + const string hlo_text = + BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16); + + EXPECT_TRUE(RunAndCompareNoHloPasses( + hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status { + BFloat16MixedPrecisionRemoval remover; + TF_RETURN_IF_ERROR(remover.Run(module).status()); + Despecializer despecializer; + return despecializer.Run(module).status(); + })); +} + +INSTANTIATE_TEST_CASE_P( + DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest, + ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()), + ::testing::Bool()), + DepthwiseConvolution2DTestDataToString); + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index a5e9cfd0cc9ff4bf477a0e46a6c215d1c32d92da..b52d30fd6624c26ad62bd0c5f6a6d74175e4539f 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -1282,7 +1282,7 @@ TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) { } template -class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest { +class Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid : public ConvolutionTest { public: void RunTest() { XlaBuilder builder(TestName()); @@ -1341,8 +1341,8 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest { } }; -TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes); -TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) { +TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, TestTypes); +TYPED_TEST(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, Types) { this->RunTest(); } diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 2f467a26b0a6390094f9eceaf791a3f928a00c6c..989a7c705a8254f99e5cc0e97dfde5942f146964 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -99,6 +99,8 @@ void VerifiedHloModule::VerifyOrAddFailure(const string& message) { ADD_FAILURE() << "HloVerifier failed on module " << name() << (message.empty() ? "" : absl::StrCat(" (", message, ")")) << ": " << status; + LOG(ERROR) << "Contents of bad module:"; + XLA_LOG_LINES(tensorflow::ERROR, ToString()); } } diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc index 5cf87e565bf493167f5173588e7afa3b96282488..34c7dc7c46427b2d18ea21fc286ee03175f70800 100644 --- a/tensorflow/compiler/xla/tests/replay_test.cc +++ b/tensorflow/compiler/xla/tests/replay_test.cc @@ -55,7 +55,8 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) { client_->GetComputationShape(computation).ConsumeValueOrDie(); std::unique_ptr replayed_shape = client_->GetComputationShape(replayed).ConsumeValueOrDie(); - ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); + ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(), + replayed_shape->ToProto())); // Run it. Literal literal = @@ -87,7 +88,8 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { client_->GetComputationShape(computation).ConsumeValueOrDie(); std::unique_ptr replayed_shape = client_->GetComputationShape(replayed).ConsumeValueOrDie(); - ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); + ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(), + replayed_shape->ToProto())); // Run it. std::unique_ptr x_data = @@ -133,7 +135,8 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) { client_->GetComputationShape(computation).ConsumeValueOrDie(); std::unique_ptr replayed_shape = client_->GetComputationShape(replayed).ConsumeValueOrDie(); - ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); + ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(), + replayed_shape->ToProto())); // Run it. Literal literal = diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc index a2b7c26331b3cc89ed0413efe8eb31c2b9e37038..601c6b06938fef1f1ae809b33209ae59b24c70a2 100644 --- a/tensorflow/compiler/xla/tests/token_hlo_test.cc +++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -108,26 +109,6 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) { ::testing::HasSubstr("Entry parameter 0 is or contains a token shape")); } -XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) { - std::unique_ptr module = CreateNewUnverifiedModule(); - auto builder = HloComputation::Builder(TestName()); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0")); - builder.AddInstruction(HloInstruction::CreateAfterAll({param})); - builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(123))); - module->AddEntryComputation(builder.Build()); - - Status status = - HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false) - .Run(module.get()) - .status(); - ASSERT_IS_NOT_OK(status); - EXPECT_THAT(status.error_message(), - ::testing::HasSubstr( - "Operands of token instructions must be TOKEN types")); -} - XLA_TEST_F(TokenHloTest, TokenInWhileLoop) { // Thread a token around a while loop. Token is created and consumed by a // AfterAll instruction in the while body. @@ -220,5 +201,95 @@ ENTRY %TokenInConditional (param.3: pred[]) -> s32[] { } } +XLA_TEST_F(TokenHloTest, AddDependency) { + string module_string = R"( +HloModule AddDependency, is_scheduled=true + +// Computes (p0 + 42) * (-p1) +// where there is a dependency from the add to the negation using a token +// with after-all and add-dependency instructions. +ENTRY %AddDependency (p0: f32[], p1: f32[]) -> f32[] { + %p0 = f32[] parameter(0) + %p1 = f32[] parameter(1) + + %forty_two = f32[] constant(42.0) + %add = f32[] add(f32[] %p0, f32[] %forty_two) + %token = token[] after-all(f32[] %add) + %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token) + %neg = f32[] negate(f32[] %p1_after_token) + ROOT %product = f32[] multiply(f32[] %add, f32[] %neg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto p0 = LiteralUtil::CreateR0(10.0); + auto p1 = LiteralUtil::CreateR0(3.0); + auto expected = LiteralUtil::CreateR0(-156.0); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1})); +} + +XLA_TEST_F(TokenHloTest, AddDependencyOfConstant) { + string module_string = R"( +HloModule AddDependencyOfConstant, is_scheduled=true + +ENTRY %AddDependency (p0: f32[]) -> f32[] { + %p0 = f32[] parameter(0) + %forty_two = f32[] constant(42.0) + %token = token[] after-all(f32[] %p0) + %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token) + ROOT %product = f32[] multiply(f32[] %p0, f32[] %forty_two_after_token) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto p0 = LiteralUtil::CreateR0(10.0); + auto expected = LiteralUtil::CreateR0(420.0); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0})); +} + +XLA_TEST_F(TokenHloTest, AddDependencyAsRoot) { + string module_string = R"( +HloModule AddDependencyAsRoot, is_scheduled=true +ENTRY %AddDependency (p: f32[3]) -> f32[3] { + %p = f32[3] parameter(0) + %neg = f32[3] negate(f32[3] %p) + %token = token[] after-all() + ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto input = LiteralUtil::CreateR1({1.0, 3.0, 7.0}); + auto expected = LiteralUtil::CreateR1({-1.0, -3.0, -7.0}); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&input})); +} + +XLA_TEST_F(TokenHloTest, TupleShapedAddDependency) { + string module_string = R"( +HloModule TupleShapedAddDependency, is_scheduled=true +ENTRY %TupleShapedAddDependency (p0: f32[3], p1: f32[3]) -> f32[3] { + %p0 = f32[3] parameter(0) + %p1 = f32[3] parameter(1) + %forty_two = f32[] constant(42.0) + %token = token[] after-all() + %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token, f32[3] %p1, f32[] %forty_two) + %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token) + %elem0 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=0 + %elem2 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=2 + ROOT %diff = f32[3] subtract(f32[3] %elem0, f32[3] %elem2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr module, + ParseHloString(module_string, GetModuleConfigForTest())); + auto p0 = LiteralUtil::CreateR1({3.0, 3.0, 47.0}); + auto p1 = LiteralUtil::CreateR1({1.0, -2.0, 2.0}); + auto expected = LiteralUtil::CreateR1({2.0, 5.0, 45.0}); + EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index 683ccc40f162ead3a248aee83d9abf3086a1ac93..27ef86ab2ebded7cfc524ac7d05fe459a52535c6 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -183,7 +183,7 @@ message Shape { // Shape of the parameters and output of a computation (like a traditional // function signature). -message ProgramShape { +message ProgramShapeProto { repeated Shape parameters = 1; Shape result = 2; repeated string parameter_names = 3; diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index dc62cf7a6b24e373374b458d2e4722e79500fb93..db43aeaafec2d3ff6770b4fb891b0451f3883917 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -174,11 +174,12 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { ctx->set_output(0, handle_output); xla::LocalExecutable* executable = entry->get().get_executable(); - xla::ProgramShape program_shape = executable->executable() - ->module() - .config() - .entry_computation_layout() - .ComputeProgramShape(); + xla::ProgramShapeProto program_shape = executable->executable() + ->module() + .config() + .entry_computation_layout() + .ComputeProgramShape() + .ToProto(); Tensor program_shape_output(DT_STRING, TensorShape({1})); program_shape_output.vec()(0) = program_shape.SerializeAsString(); ctx->set_output(1, program_shape_output); diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 25464b5554d21f4b936f3f4a442fd174a8b56a8b..7e73db98f7873ba48bffaec8be2d887552f310af 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -411,7 +411,7 @@ TEST(RawApiTest, CompileAndExecute) { auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); } @@ -465,7 +465,7 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { auto expected = xla::LiteralUtil::CreateR1({27.0f, 21.0f}); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); } @@ -510,7 +510,7 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {c_handle.program_shape}, {release}, &outputs)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[0].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 1); @@ -520,7 +520,7 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) { << xla::ShapeUtil::HumanStringWithLayout(program_shape.result()); xla::ProgramShape xla_program_shape = - XlaCompiledProgramShape(xla_computation, *shapes); + XlaCompiledProgramShape(xla_computation, xla::ProgramShape(*shapes)); EXPECT_TRUE(xla::LayoutUtil::Equal( xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {0}).layout(), xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {0}) @@ -739,7 +739,7 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) { auto expected = xla::LiteralUtil::CreateR0(15123899); EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); - xla::ProgramShape program_shape; + xla::ProgramShapeProto program_shape; EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec()(0))); EXPECT_EQ(program_shape.parameters_size(), 2); EXPECT_TRUE( diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 6ab77fbaaf0cbe23503ebc71775f52af01e41a74..ae44f717409f804b38daf32914f79648fb8a127c 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -36,11 +36,11 @@ message XLAComputationConfig { tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3; // The arg/result shapes for the whole computation. - xla.ProgramShape program_shape = 4; + xla.ProgramShapeProto program_shape = 4; // The arg/result shapes for each core of a model-parallel // computation. per_core_args_and_result_shapes is optional for a // single-core computation. - repeated xla.ProgramShape per_core_program_shape = 5; + repeated xla.ProgramShapeProto per_core_program_shape = 5; // Describes how replicated computation instances should be assigned to // devices. There are num_cores_per_replica computations, and each one will be // sent and executed to the set of replica device numbers described in the diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..6d2d70c99b4cc804f2c8bf57afdc8c11f1f73516 --- /dev/null +++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD @@ -0,0 +1,36 @@ +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark") + +py_library( + name = "benchmark_base", + srcs = [ + "benchmark_base.py", + ], + deps = [ + "//tensorflow:tensorflow_py", + ], +) + +py_test( + name = "cartpole_benchmark", + size = "enormous", + srcs = ["cartpole_benchmark.py"], + tags = [ + "local", + "manual", + "no_oss", + "notap", + "nozapfhahn", + ], + deps = [ + ":benchmark_base", + # Note: required gym dependency may need to be added here. + ], +) + +tf_py_logged_benchmark( + name = "cartpole_logged_benchmark", + target = "//tensorflow/contrib/autograph/examples/benchmarks:cartpole_benchmark", +) diff --git a/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py new file mode 100644 index 0000000000000000000000000000000000000000..93c694849c4dc3faca71e7f9d8614649a7784f99 --- /dev/null +++ b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py @@ -0,0 +1,62 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Common benchmarking code. + +See https://www.tensorflow.org/community/benchmarks for usage. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np + +import tensorflow as tf + + +class ReportingBenchmark(tf.test.Benchmark): + """Base class for a benchmark that reports general performance metrics. + + Subclasses only need to call one of the _profile methods, and optionally + report_results. + """ + + def time_execution(self, name, target, iters, warm_up_iters=5): + for _ in range(warm_up_iters): + target() + + all_times = [] + for _ in range(iters): + iter_time = time.time() + target() + all_times.append(time.time() - iter_time) + + avg_time = np.average(all_times) + + extras = dict() + extras['all_times'] = all_times + + if isinstance(name, tuple): + extras['name'] = name + name = '_'.join(str(piece) for piece in name) + + self.report_benchmark( + iters=iters, wall_time=avg_time, name=name, extras=extras) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..4f553be58e94f11e45f0697558348fbbd26bfb91 --- /dev/null +++ b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py @@ -0,0 +1,492 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A basic RL cartpole benchmark. + +The RL model uses the OpenAI Gym environment to train a simple network using +the policy gradients method. The training scales the gradients for each step +by the episode's cumulative discounted reward and averages these gradients over +a fixed number of games before applying the optimization step. + +For benchmarking purposes, we replace the OpenAI Gym environment to a fake +that returns random actions and rewards and never ends the episode. This way +the benchmarks compare the same amount of computation at each step. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gym +import numpy as np +import tensorflow as tf + +from tensorflow.contrib import eager +from tensorflow.contrib.autograph.examples.benchmarks import benchmark_base +from tensorflow.python import autograph as ag +from tensorflow.python.eager import context + +# +# AutoGraph implementation +# + + +@ag.convert() +def graph_append_discounted_rewards(destination, rewards, discount_rate): + """Discounts episode rewards and appends them to destination.""" + ag.set_element_type(rewards, tf.float32) + + cdr = 0.0 + reverse_discounted = [] + ag.set_element_type(reverse_discounted, tf.float32) + + for i in range(len(rewards) - 1, -1, -1): + cdr = cdr * discount_rate + rewards[i] + cdr.set_shape(()) + reverse_discounted.append(cdr) + + retval = destination + # Note: AutoGraph doesn't yet support reversed() so we use a loop instead. + for i in range(len(reverse_discounted) - 1, -1, -1): + retval.append(reverse_discounted[i]) + + return retval + + +class GraphPolicyNetwork(tf.keras.Model): + """Policy network for the cart-pole reinforcement learning problem. + + The forward path of the network takes an observation from the cart-pole + environment (length-4 vector) and outputs an action. + """ + + def __init__(self, hidden_size): + super(GraphPolicyNetwork, self).__init__() + self._hidden_layer = tf.keras.layers.Dense( + hidden_size, activation=tf.nn.elu) + self._output_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + """Calculates logits and action. + + Args: + inputs: Observations from a step in the cart-pole environment, of shape + `(batch_size, input_size)` + + Returns: + logits: the logits output by the output layer. This can be viewed as the + likelihood vales of choosing the left (0) action. Shape: + `(batch_size, 1)`. + actions: randomly selected actions ({0, 1}) based on the logits. Shape: + `(batch_size, 1)`. + """ + hidden = self._hidden_layer(inputs) + logits = self._output_layer(hidden) + + left_prob = tf.nn.sigmoid(logits) + action_probs = tf.concat([left_prob, 1.0 - left_prob], 1) + + actions = tf.multinomial(tf.log(action_probs), 1) + return logits, actions + + # TODO(mdan): Move this method out of the class. + @ag.convert() + def train(self, cart_pole_env, optimizer, discount_rate, num_games, + max_steps_per_game): + var_list = tf.trainable_variables() + grad_list = [ + tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list + ] + + step_counts = [] + discounted_rewards = [] + ag.set_element_type(discounted_rewards, tf.float32) + ag.set_element_type(step_counts, tf.int32) + + # Note: we use a shared object, cart_pole_env here. Because calls to the + # object's method are made through py_func, TensorFlow cannot detect its + # data dependencies. Hence we must manually synchronize access to it + # and ensure the control dependencies are set in such a way that + # calls to reset(), take_one_step, etc. are made in the correct order. + sync_counter = tf.constant(0) + + for _ in tf.range(num_games): + with tf.control_dependencies([sync_counter]): + obs = cart_pole_env.reset() + with tf.control_dependencies([obs]): + sync_counter += 1 + + game_rewards = [] + ag.set_element_type(game_rewards, tf.float32) + + for step in tf.range(max_steps_per_game): + logits, actions = self(obs) # pylint:disable=not-callable + logits = tf.reshape(logits, ()) + actions = tf.reshape(actions, ()) + + labels = 1.0 - tf.cast(actions, tf.float32) + loss = tf.nn.sigmoid_cross_entropy_with_logits( + labels=labels, logits=logits) + grads = tf.gradients(loss, var_list) + + for i in range(len(grads)): + grad_list[i].append(grads[i]) + + with tf.control_dependencies([sync_counter]): + obs, reward, done = cart_pole_env.step(actions) + with tf.control_dependencies([obs]): + sync_counter += 1 + obs = tf.reshape(obs, (1, 4)) + + game_rewards.append(reward) + if reward < 0.1 or done: + step_counts.append(step + 1) + break + + discounted_rewards = graph_append_discounted_rewards( + discounted_rewards, game_rewards, discount_rate) + + discounted_rewards = ag.stack(discounted_rewards) + discounted_rewards.set_shape((None,)) + mean, variance = tf.nn.moments(discounted_rewards, [0]) + normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance) + + for i in range(len(grad_list)): + g = ag.stack(grad_list[i]) + + # This block just adjusts the shapes to match for multiplication. + r = normalized_rewards + if r.shape.ndims < g.shape.ndims: + r = tf.expand_dims(r, -1) + if r.shape.ndims < g.shape.ndims: + r = tf.expand_dims(r, -1) + + grad_list[i] = tf.reduce_mean(g * r, axis=0) + + optimizer.apply_gradients( + zip(grad_list, var_list), global_step=tf.train.get_global_step()) + + return ag.stack(step_counts) + + +@ag.convert() +def graph_train_model(policy_network, cart_pole_env, optimizer, iterations): + """Trains the policy network for a given number of iterations.""" + i = tf.constant(0) + mean_steps_per_iteration = [] + ag.set_element_type(mean_steps_per_iteration, tf.int32) + + while i < iterations: + steps_per_game = policy_network.train( + cart_pole_env, + optimizer, + discount_rate=0.95, + num_games=20, + max_steps_per_game=200) + mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game)) + i += 1 + + return ag.stack(mean_steps_per_iteration) + + +class GraphGymCartpoleEnv(object): + """An env backed by OpenAI Gym's CartPole environment. + + Used to confirm a functional model only. + """ + + def __init__(self): + cart_pole_env = gym.make('CartPole-v1') + cart_pole_env.seed(0) + cart_pole_env.reset() + self.env = cart_pole_env + + def reset(self): + obs = ag.utils.wrap_py_func(self.env.reset, tf.float64, ()) + obs = tf.reshape(obs, (1, 4)) + obs = tf.cast(obs, tf.float32) + return obs + + def step(self, actions): + + def take_one_step(actions): + obs, reward, done, _ = self.env.step(actions) + obs = obs.astype(np.float32) + reward = np.float32(reward) + return obs, reward, done + + return ag.utils.wrap_py_func(take_one_step, + (tf.float32, tf.float32, tf.bool), (actions,)) + + +class GraphRandomCartpoleEnv(object): + """An environment that returns random actions and never finishes. + + Used during benchmarking, it will cause training to run a constant number of + steps. + """ + + def reset(self): + return tf.random.normal((1, 4)) + + def step(self, actions): + with tf.control_dependencies([actions]): + random_obs = tf.random.normal((1, 4)) + fixed_reward = tf.constant(0.001) + done = tf.constant(False) + return random_obs, fixed_reward, done + + +# +# Eager implementation +# + + +def eager_append_discounted_rewards(discounted_rewards, rewards, discount_rate): + cdr = 0.0 + reverse_discounted = [] + + for i in range(len(rewards) - 1, -1, -1): + cdr = cdr * discount_rate + rewards[i] + reverse_discounted.append(cdr) + + discounted_rewards.extend(reversed(reverse_discounted)) + return discounted_rewards + + +class EagerPolicyNetwork(tf.keras.Model): + """Policy network for the cart-pole reinforcement learning problem. + + The forward path of the network takes an observation from the cart-pole + environment (length-4 vector) and outputs an action. + """ + + def __init__(self, hidden_size): + super(EagerPolicyNetwork, self).__init__() + self._hidden_layer = tf.keras.layers.Dense( + hidden_size, activation=tf.nn.elu) + self._output_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + """Calculates logits and action. + + Args: + inputs: Observations from a step in the cart-pole environment, of shape + `(batch_size, input_size)` + + Returns: + logits: the logits output by the output layer. This can be viewed as the + likelihood vales of choosing the left (0) action. Shape: + `(batch_size, 1)`. + actions: randomly selected actions ({0, 1}) based on the logits. Shape: + `(batch_size, 1)`. + """ + hidden = self._hidden_layer(inputs) + logits = self._output_layer(hidden) + + left_prob = tf.nn.sigmoid(logits) + action_probs = tf.concat([left_prob, 1.0 - left_prob], 1) + + self._grad_fn = eager.implicit_gradients( + self._get_cross_entropy_and_save_actions) + + actions = tf.multinomial(tf.log(action_probs), 1) + return logits, actions + + def _get_cross_entropy_and_save_actions(self, inputs): + logits, actions = self(inputs) # pylint:disable=not-callable + self._current_actions = actions + labels = 1.0 - tf.cast(actions, tf.float32) + return tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) + + def train(self, cart_pole_env, optimizer, discount_rate, num_games, + max_steps_per_game): + grad_list = None + + step_counts = [] + discounted_rewards = [] + + for _ in range(num_games): + obs = cart_pole_env.reset() + + game_rewards = [] + + for step in range(max_steps_per_game): + grads_and_vars = self._grad_fn(tf.constant([obs], dtype=tf.float32)) + grads, var_list = zip(*grads_and_vars) + actions = self._current_actions.numpy()[0][0] + + if grad_list is None: + grad_list = [[g] for g in grads] + else: + for i in range(len(grads)): + grad_list[i].append(grads[i]) + + obs, reward, done = cart_pole_env.step(actions) + + game_rewards.append(reward) + if reward < 0.1 or done: + step_counts.append(step + 1) + break + + discounted_rewards = eager_append_discounted_rewards( + discounted_rewards, game_rewards, discount_rate) + + discounted_rewards = tf.stack(discounted_rewards) + mean, variance = tf.nn.moments(discounted_rewards, [0]) + normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance) + + for i in range(len(grad_list)): + g = tf.stack(grad_list[i]) + + r = normalized_rewards + while r.shape.ndims < g.shape.ndims: + r = tf.expand_dims(r, -1) + + grad_list[i] = tf.reduce_mean(g * r, axis=0) + + optimizer.apply_gradients( + zip(grad_list, var_list), global_step=tf.train.get_global_step()) + + return tf.stack(step_counts) + + +def eager_train_model(policy_network, cart_pole_env, optimizer, iterations): + """Trains the policy network for a given number of iterations.""" + mean_steps_per_iteration = [] + + for _ in range(iterations): + steps_per_game = policy_network.train( + cart_pole_env, + optimizer, + discount_rate=0.95, + num_games=20, + max_steps_per_game=200) + mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game)) + + return mean_steps_per_iteration + + +class EagerGymCartpoleEnv(object): + """An env backed by OpenAI Gym's CartPole environment. + + Used to confirm a functional model only. + """ + + def __init__(self): + cart_pole_env = gym.make('CartPole-v1') + cart_pole_env.seed(0) + cart_pole_env.reset() + self.env = cart_pole_env + + def reset(self): + return self.env.reset() + + def step(self, actions): + obs, reward, done, _ = self.env.step(actions) + return obs, reward, done + + +class EagerRandomCartpoleEnv(object): + """An environment that returns random actions and never finishes. + + Used during benchmarking, it will cause training to run a constant number of + steps. + """ + + def reset(self): + return np.random.normal(size=(4,)) + + def step(self, actions): + with tf.control_dependencies([actions]): + random_obs = np.random.normal(size=(4,)) + fixed_reward = 0.001 + done = False + return random_obs, fixed_reward, done + + +def graph_demo_training(): + """Not used in the benchmark. Used to confirm a functional model.""" + with tf.Graph().as_default(): + tf.set_random_seed(0) + + network = GraphPolicyNetwork(hidden_size=5) + network.build((1, 4)) + env = GraphGymCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + + train_ops = graph_train_model(network, env, opt, iterations=5) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + steps_per_iteration = sess.run(train_ops) + for i, steps in enumerate(steps_per_iteration): + print('Step {} iterations: {}'.format(i, steps)) + + +def eager_demo_training(): + with context.eager_mode(): + network = EagerPolicyNetwork(hidden_size=5) + network.build((1, 4)) + env = EagerGymCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + + steps_per_iteration = eager_train_model(network, env, opt, iterations=5) + for i, steps in enumerate(steps_per_iteration): + print('Step {} iterations: {}'.format(i, steps)) + + +class RLCartPoleBenchmark(benchmark_base.ReportingBenchmark): + """Actual benchmark. + + Trains the RL agent a fixed number of times, on random environments that + result in constant number of steps. + """ + + def benchmark_cartpole(self): + + def train_session(sess, ops): + return lambda: sess.run(ops) + + def train_eager(network, env, opt): + return lambda: eager_train_model(network, env, opt, iterations=10) + + for model_size in (10, 100, 1000): + with tf.Graph().as_default(): + network = GraphPolicyNetwork(hidden_size=model_size) + network.build((1, 4)) + env = GraphRandomCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + train_ops = graph_train_model(network, env, opt, iterations=10) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(tf.local_variables_initializer()) + + self.time_execution(('cartpole', 'autograph', model_size), + train_session(sess, train_ops), 20) + + with context.eager_mode(): + network = EagerPolicyNetwork(hidden_size=model_size) + network.build((1, 4)) + env = EagerRandomCartpoleEnv() + opt = tf.train.AdamOptimizer(0.05) + + self.time_execution(('cartpole', 'eager', model_size), + train_eager(network, env, opt), 20) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py index 13215ffabf3a956d3f83697f867457b2fa72e7c9..8b6ed9f041b89a0da02a505ec261bca82b094f74 100644 --- a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py @@ -81,7 +81,7 @@ class ExpectationImportanceSampleTest(test.TestCase): # Compute E_p[X_1 * X_2 > 0], with X_i the ith component of X ~ p(x). # Should equal 1/2 because p is a spherical Gaussian centered at (0, 0). def indicator(x): - x1_times_x2 = math_ops.reduce_prod(x, reduction_indices=[-1]) + x1_times_x2 = math_ops.reduce_prod(x, axis=[-1]) return 0.5 * (math_ops.sign(x1_times_x2) + 1.0) prob = mc.expectation_importance_sampler( diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py index 18d40fc1dff8e7c9aefffbe3ceba770598a42096..e83a54851195708eb7e6412b7400236f4bc06e6b 100644 --- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py +++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py @@ -353,12 +353,12 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True, def _sample_mean(values): """Mean over sample indices. In this module this is always [0].""" - return math_ops.reduce_mean(values, reduction_indices=[0]) + return math_ops.reduce_mean(values, axis=[0]) def _sample_max(values): """Max over sample indices. In this module this is always [0].""" - return math_ops.reduce_max(values, reduction_indices=[0]) + return math_ops.reduce_max(values, axis=[0]) def _get_samples(dist, z, n, seed): diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc index f083ce6f44b3c2a83d9b5d3235056eb94c4be4a8..e95dc577184f7e81d942755b41065f52131ce9f6 100644 --- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc +++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc @@ -366,6 +366,39 @@ BigtableTestClient::MutateRows( return MakeUnique(request.entries_size()); } +std::unique_ptr> +BigtableTestClient::AsyncMutateRow( + grpc::ClientContext* context, + google::bigtable::v2::MutateRowRequest const& request, + grpc::CompletionQueue* cq) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + +std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::SampleRowKeysResponse>> +BigtableTestClient::AsyncSampleRowKeys( + ::grpc::ClientContext* context, + const ::google::bigtable::v2::SampleRowKeysRequest& request, + ::grpc::CompletionQueue* cq, void* tag) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + +std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::MutateRowsResponse>> +BigtableTestClient::AsyncMutateRows( + ::grpc::ClientContext* context, + const ::google::bigtable::v2::MutateRowsRequest& request, + ::grpc::CompletionQueue* cq, void* tag) { + LOG(WARNING) << "Call to InMemoryDataClient::" << __func__ + << "(); this will likely cause a crash!"; + return nullptr; +} + std::shared_ptr BigtableTestClient::Channel() { LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely " "cause a crash!"; diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h index dac2b16a216d26f02684c7401ed2ddaa4b7baddb..c4a1f06bc504c3565c7bb09b42e48e7fbddb9cc6 100644 --- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h +++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h @@ -61,6 +61,25 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient { MutateRows(grpc::ClientContext* context, google::bigtable::v2::MutateRowsRequest const& request) override; + std::unique_ptr> + AsyncMutateRow(grpc::ClientContext* context, + google::bigtable::v2::MutateRowRequest const& request, + grpc::CompletionQueue* cq) override; + + std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::SampleRowKeysResponse>> + AsyncSampleRowKeys( + ::grpc::ClientContext* context, + const ::google::bigtable::v2::SampleRowKeysRequest& request, + ::grpc::CompletionQueue* cq, void* tag) override; + + std::unique_ptr<::grpc::ClientAsyncReaderInterface< + ::google::bigtable::v2::MutateRowsResponse>> + AsyncMutateRows(::grpc::ClientContext* context, + const ::google::bigtable::v2::MutateRowsRequest& request, + ::grpc::CompletionQueue* cq, void* tag) override; + std::shared_ptr Channel() override; private: diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py index a3df272e6924792128fc38fd153b9527b58b486e..b314b4d74df882a421d9a2ecce2629a63d5c5248 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py @@ -41,7 +41,8 @@ def make_custom_export_strategy(name, convert_fn, feature_columns, export_input_fn, - use_core_columns=False): + use_core_columns=False, + feature_engineering_fn=None): """Makes custom exporter of GTFlow tree format. Args: @@ -52,6 +53,7 @@ def make_custom_export_strategy(name, export_input_fn: A function that takes no arguments and returns an `InputFnOps`. use_core_columns: A boolean, whether core feature columns were used. + feature_engineering_fn: Feature eng function to be called on the input. Returns: An `ExportStrategy`. @@ -59,9 +61,12 @@ def make_custom_export_strategy(name, base_strategy = saved_model_export_utils.make_export_strategy( serving_input_fn=export_input_fn, strip_default_attrs=True) input_fn = export_input_fn() + features = input_fn.features + if feature_engineering_fn is not None: + features, _ = feature_engineering_fn(features, labels=None) (sorted_feature_names, dense_floats, sparse_float_indices, _, _, sparse_int_indices, _, _) = gbdt_batch.extract_features( - input_fn.features, feature_columns, use_core_columns) + features, feature_columns, use_core_columns) def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None): """A wrapper to export to SavedModel, and convert it to other formats.""" diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py index 99ecded6535137ed3e1c6b1cf7a1a24e9067e88c..a178820841c4c8bcb7f5742babdb6d0f4825de31 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py @@ -428,6 +428,7 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator): learner_config, examples_per_layer, quantiles, + label_dimension=1, num_trees=None, feature_columns=None, weight_column_name=None, @@ -448,6 +449,10 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator): layer. It can also be a function that computes the number of examples based on the depth of the layer that's being built. quantiles: a list of quantiles for the loss, each between 0 and 1. + label_dimension: Dimension of regression label. This is the size + of the last dimension of the labels `Tensor` (typically, this has shape + `[batch_size, label_dimension]`). When label_dimension>1, it is + recommended to use multiclass strategy diagonal hessian or full hessian. num_trees: An int, number of trees to build. feature_columns: A list of feature columns. weight_column_name: Name of the column for weights, or None if not @@ -489,9 +494,11 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator): loss_fn=functools.partial( losses.per_example_quantile_regression_loss, quantile=quantile), link_fn=array_ops.identity, - logit_dimension=1) + logit_dimension=label_dimension) return head + learner_config.num_classes = max(2, label_dimension) + super(GradientBoostedDecisionTreeQuantileRegressor, self).__init__( model_fn=model.model_builder, params={ @@ -548,6 +555,7 @@ def core_multiclass_head( # Core..QuantileRegressor directly, def core_quantile_regression_head( quantiles, + label_dimension=1, weight_column=None, loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS): """Core head for quantile regression problems.""" @@ -562,7 +570,7 @@ def core_quantile_regression_head( # pylint:disable=protected-access head_fn = core_head_lib._regression_head( - label_dimension=1, + label_dimension=label_dimension, loss_fn=loss_fn, loss_reduction=loss_reduction, weight_column=weight_column) @@ -747,6 +755,7 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor( learner_config, examples_per_layer, quantiles, + label_dimension=1, num_trees=None, feature_columns=None, weight_column_name=None, @@ -766,6 +775,10 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor( layer. It can also be a function that computes the number of examples based on the depth of the layer that's being built. quantiles: a list of quantiles for the loss, each between 0 and 1. + label_dimension: Dimension of regression label. This is the size + of the last dimension of the labels `Tensor` (typically, this has shape + `[batch_size, label_dimension]`). When label_dimension>1, it is + recommended to use multiclass strategy diagonal hessian or full hessian. num_trees: An int, number of trees to build. feature_columns: A list of feature columns. weight_column_name: Name of the column for weights, or None if not @@ -799,18 +812,31 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor( mode=mode, config=config, params={ - 'head': core_quantile_regression_head(quantiles[0]), - 'feature_columns': feature_columns, - 'learner_config': learner_config, - 'num_trees': num_trees, - 'weight_column_name': weight_column_name, - 'examples_per_layer': examples_per_layer, - 'center_bias': center_bias, - 'logits_modifier_function': logits_modifier_function, - 'use_core_libs': True, - 'output_leaf_index': output_leaf_index, - 'override_global_step_value': None, - 'num_quantiles': num_quantiles, + 'head': + core_quantile_regression_head( + quantiles[0], label_dimension=label_dimension), + 'feature_columns': + feature_columns, + 'learner_config': + learner_config, + 'num_trees': + num_trees, + 'weight_column_name': + weight_column_name, + 'examples_per_layer': + examples_per_layer, + 'center_bias': + center_bias, + 'logits_modifier_function': + logits_modifier_function, + 'use_core_libs': + True, + 'output_leaf_index': + output_leaf_index, + 'override_global_step_value': + None, + 'num_quantiles': + num_quantiles, }, output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC) diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py index 1ede129925b2840b3c4f301421d3218802e82f54..ee052ac60387d8f993e4942dd7dff39e191dd3a4 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py @@ -81,7 +81,7 @@ def _infer_ranking_train_input_fn(): _QUANTILE_REGRESSION_SIZE = 1000 -def _quantile_regression_input_fns(): +def _quantile_regression_input_fns(two_dimension=False): # The data generation is taken from # http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html np.random.seed(1) @@ -90,20 +90,28 @@ def _quantile_regression_input_fns(): """The function to predict.""" return x * np.sin(x) + def g(x): + """The function to predict.""" + return x * np.cos(x) + # Training data. x = np.atleast_2d(np.random.uniform(0, 10.0, size=_QUANTILE_REGRESSION_SIZE)).T x = x.astype(np.float32) # Labels. - y = f(x).ravel() + if not two_dimension: + y = f(x).ravel() + else: + y = np.column_stack((f(x).ravel(), g(x).ravel())) # Add random noise. dy = 1.5 + 1.0 * np.random.random(y.shape) noise = np.random.normal(0, dy) y += noise y_original = y.astype(np.float32) - y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1) + if not two_dimension: + y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1) train_input_fn = numpy_io.numpy_input_fn( x=x, @@ -417,7 +425,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3) # +/- 3% - self.assertBetween(frac_below_upper, 0.92, 0.98) + self.assertTrue(frac_below_upper >= 0.92) + self.assertTrue(frac_below_upper <= 0.98) train_input_fn, test_input_fn, _ = _quantile_regression_input_fns() model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor( @@ -435,7 +444,80 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase): frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3) # +/- 3% - self.assertBetween(frac_above_lower, 0.92, 0.98) + self.assertTrue(frac_above_lower >= 0.92) + self.assertTrue(frac_above_lower <= 0.98) + + # Multi-dimensional quantile regression. + def testQuantileRegressionMultiDimLabel(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE + learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.tree_complexity = ( + 1.0 / _QUANTILE_REGRESSION_SIZE) + + train_input_fn, test_input_fn, y = _quantile_regression_input_fns( + two_dimension=True) + + # 95% percentile. + model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.95], + learner_config=learner_config, + label_dimension=2, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_upper.fit(input_fn=train_input_fn, steps=1000) + result_iter = model_upper.predict(input_fn=test_input_fn) + upper = [] + for prediction_dict in result_iter: + upper.append(prediction_dict["scores"]) + + count_below_upper = np.count_nonzero(upper > y, axis=0) + count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1)) + frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3) + frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3) + frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3) + # +/- 3% + self.assertTrue(frac_below_upper_0 >= 0.92) + self.assertTrue(frac_below_upper_0 <= 0.98) + self.assertTrue(frac_below_upper_1 >= 0.92) + self.assertTrue(frac_below_upper_1 <= 0.98) + self.assertTrue(frac_both_below_upper >= 0.92) + self.assertTrue(frac_both_below_upper <= 0.98) + + train_input_fn, test_input_fn, _ = _quantile_regression_input_fns( + two_dimension=True) + model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.05], + learner_config=learner_config, + label_dimension=2, + num_trees=100, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_lower.fit(input_fn=train_input_fn, steps=1000) + result_iter = model_lower.predict(input_fn=test_input_fn) + lower = [] + for prediction_dict in result_iter: + lower.append(prediction_dict["scores"]) + + count_above_lower = np.count_nonzero(lower < y, axis=0) + count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1)) + frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3) + frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3) + frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3) + # +/- 3% + self.assertTrue(frac_above_lower_0 >= 0.92) + self.assertTrue(frac_above_lower_0 <= 0.98) + self.assertTrue(frac_above_lower_1 >= 0.92) + self.assertTrue(frac_above_lower_1 <= 0.98) + self.assertTrue(frac_both_above_lower >= 0.92) + self.assertTrue(frac_both_above_lower <= 0.98) class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): @@ -661,7 +743,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3) # +/- 3% - self.assertBetween(frac_below_upper, 0.92, 0.98) + self.assertTrue(frac_below_upper >= 0.92) + self.assertTrue(frac_below_upper <= 0.98) train_input_fn, test_input_fn, _ = _quantile_regression_input_fns() model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( @@ -679,7 +762,81 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase): frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3) # +/- 3% - self.assertBetween(frac_above_lower, 0.92, 0.98) + self.assertTrue(frac_above_lower >= 0.92) + self.assertTrue(frac_above_lower <= 0.98) + + # Multi-dimensional quantile regression. + def testQuantileRegressionMultiDimLabel(self): + learner_config = learner_pb2.LearnerConfig() + learner_config.num_classes = 2 + learner_config.constraints.max_tree_depth = 3 + learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE + learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE + learner_config.regularization.tree_complexity = ( + 1.0 / _QUANTILE_REGRESSION_SIZE) + + train_input_fn, test_input_fn, y = _quantile_regression_input_fns( + two_dimension=True) + y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2) + + # 95% percentile. + model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.95], + learner_config=learner_config, + num_trees=100, + label_dimension=2, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_upper.train(input_fn=train_input_fn, steps=1000) + result_iter = model_upper.predict(input_fn=test_input_fn) + upper = [] + for prediction_dict in result_iter: + upper.append(prediction_dict["predictions"]) + + count_below_upper = np.count_nonzero(upper > y, axis=0) + count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1)) + frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3) + frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3) + frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3) + # +/- 3% + self.assertTrue(frac_below_upper_0 >= 0.92) + self.assertTrue(frac_below_upper_0 <= 0.98) + self.assertTrue(frac_below_upper_1 >= 0.92) + self.assertTrue(frac_below_upper_1 <= 0.98) + self.assertTrue(frac_both_below_upper >= 0.92) + self.assertTrue(frac_both_below_upper <= 0.98) + + train_input_fn, test_input_fn, _ = _quantile_regression_input_fns( + two_dimension=True) + model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor( + quantiles=[0.05], + learner_config=learner_config, + num_trees=100, + label_dimension=2, + examples_per_layer=_QUANTILE_REGRESSION_SIZE, + center_bias=False) + + model_lower.train(input_fn=train_input_fn, steps=1000) + result_iter = model_lower.predict(input_fn=test_input_fn) + lower = [] + for prediction_dict in result_iter: + lower.append(prediction_dict["predictions"]) + + count_above_lower = np.count_nonzero(lower < y, axis=0) + count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1)) + frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3) + frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3) + frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3) + # +/- 3% + self.assertTrue(frac_above_lower_0 >= 0.92) + self.assertTrue(frac_above_lower_0 <= 0.98) + self.assertTrue(frac_above_lower_1 >= 0.92) + self.assertTrue(frac_above_lower_1 <= 0.98) + self.assertTrue(frac_both_above_lower >= 0.92) + self.assertTrue(frac_both_above_lower <= 0.98) if __name__ == "__main__": diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py index f8da20a54c0ef297b17e76290d80a3bf83c7b0ca..220e981618b7c0bfb1e4e98c087d83b451b9b3cf 100644 --- a/tensorflow/contrib/boosted_trees/python/utils/losses.py +++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py @@ -65,9 +65,9 @@ def per_example_quantile_regression_loss(labels, weights, predictions, below is this loss but squared in the region where the loss value < 1. Args: - labels: Rank 2 (N, 1) tensor of per-example labels. + labels: Rank 2 (N, D) tensor of per-example labels. weights: Rank 2 (N, 1) tensor of per-example weights. - predictions: Rank 2 (N, 1) tensor of per-example predictions. + predictions: Rank 2 (N, D) tensor of per-example predictions. quantile: The quantile to use. Returns: @@ -119,8 +119,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15): labels = array_ops.expand_dims(labels, 1) # Labels are indices of classes, convert them to one hot encodings. target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes) - labels = math_ops.reduce_sum( - input_tensor=target_one_hot, reduction_indices=[1]) + labels = math_ops.reduce_sum(input_tensor=target_one_hot, axis=[1]) labels = math_ops.to_float(labels) # Calculate softmax probabilities for each class. diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py index 41258edd90866ae9f644a02c42dfe2dc589da998..6926c0d03fe38ab2d62cc588950c7f5a49b2aba1 100644 --- a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py +++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py @@ -74,8 +74,8 @@ class ConstrainedMinimizationProblem(object): if (constraints_shape.ndims is None or proxy_constraints_shape.ndims is None or - any([ii is None for ii in constraints_shape.as_list()]) or - any([ii is None for ii in proxy_constraints_shape.as_list()])): + any(ii is None for ii in constraints_shape.as_list()) or + any(ii is None for ii in proxy_constraints_shape.as_list())): raise ValueError( "constraints and proxy_constraints must have fully-known shapes") if constraints_shape != proxy_constraints_shape: diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py index 1e2c9121d63267692ee80f14299392e19ab95a88..a268415f0e65206294431a537be18cadbe1a1e84 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py @@ -778,8 +778,7 @@ class CudnnParamsFormatConverterTest(TensorFlowTestCase, # Test opaque_params size lower bound opaque_params_size_v = sess.run(opaque_params_size) - min_params_size = ( - np.sum([x.size for x in ws]) + np.sum([x.size for x in bs])) + min_params_size = sum(x.size for x in ws) + np.sum(x.size for x in bs) logging.info("min_parm_size: %d vs actual_opaque_param_size: %d", min_params_size, opaque_params_size_v) self.assertLessEqual(min_params_size, opaque_params_size_v) @@ -853,8 +852,7 @@ class CudnnParamsFormatConverterTest(TensorFlowTestCase, # Test opaque_params size lower bound opaque_params_size_v = sess.run(opaque_params_size) - min_params_size = ( - np.sum([x.size for x in ws]) + np.sum([x.size for x in bs])) + min_params_size = sum(x.size for x in ws) + sum(x.size for x in bs) logging.info("min_parm_size: %d vs actual_opaque_param_size: %d", min_params_size, opaque_params_size_v) self.assertLessEqual(min_params_size, opaque_params_size_v) diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py index 6cc93dccb004687a2d583a5d1925ea6b98c98979..7e1b4062ce435f3ab4216e90b4f5fcbab984c1dc 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py @@ -1045,8 +1045,8 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase): # Min param size estimate = sum(weights.size) + sum(biases.size) min_params_size = ( - np.sum(list(map(np.prod, rnn.canonical_weight_shapes))) + - np.sum([sp[0] for sp in rnn.canonical_bias_shapes])) + sum(map(np.prod, rnn.canonical_weight_shapes)) + + sum(sp[0] for sp in rnn.canonical_bias_shapes)) opaque_params = rnn.trainable_variables[0] with self.test_session(use_gpu=True, graph=ops.get_default_graph()): diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py index 8bbcc7cd0397a5339a69e4e44528f0e56584043a..8e25637ed91a1559b321ea96efbfaa2910f67158 100644 --- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py +++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py @@ -21,6 +21,7 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base as base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops @@ -322,7 +323,7 @@ class _CudnnRNN(base_layer.Layer): raise ValueError("The last dimension of the inputs to `CudnnRNN` " "should be defined. Found `None`.") self._input_size = input_shape[-1].value - self.input_spec = base_layer.InputSpec(ndim=3, axes={-1: self._input_size}) + self.input_spec = input_spec.InputSpec(ndim=3, axes={-1: self._input_size}) self._set_scope(None) diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD index a87a5624c88d1d0af10055261dad55937ed6aeb0..3ecd755d86f6be47910aebbdb46d335d165427d8 100644 --- a/tensorflow/contrib/distribute/BUILD +++ b/tensorflow/contrib/distribute/BUILD @@ -26,7 +26,6 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy", - "//tensorflow/contrib/distribute/python:cross_tower_ops", "//tensorflow/contrib/distribute/python:mirrored_strategy", "//tensorflow/contrib/distribute/python:monitor", "//tensorflow/contrib/distribute/python:one_device_strategy", @@ -35,6 +34,7 @@ py_library( "//tensorflow/contrib/distribute/python:tpu_strategy", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python/distribute:cross_device_ops", "//tensorflow/python/distribute:distribute_config", "//tensorflow/python/distribute:distribute_coordinator", ], diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md index a938f8629d8210b4b512338a040340f21d3ef594..8a8dc159ade6f2a4a9b5ec29055ea4848492b29f 100644 --- a/tensorflow/contrib/distribute/README.md +++ b/tensorflow/contrib/distribute/README.md @@ -134,7 +134,7 @@ def model_fn(features, labels, mode): return tf.estimator.EstimatorSpec(mode, loss=loss) if mode == tf.estimator.ModeKeys.TRAIN: - train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn()) + train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss) return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op) ``` @@ -248,19 +248,17 @@ Let's use the same example for multi-worker. We'll start a cluster with 3 workers doing synchronous all-reduce training. In the following code snippet, we start multi-worker training using `tf.estimator.train_and_evaluate`: - ```python def model_main(): - estimator = ... distribution = tf.contrib.distribute.CollectiveAllReduceStrategy( num_gpus_per_worker=2) config = tf.estimator.RunConfig(train_distribute=distribution) + estimator = tf.estimator.Estimator(model_fn=model_fn, config=config) train_spec = tf.estimator.TrainSpec(input_fn=input_fn) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) ``` - **Note**: You don't have to set "TF\_CONFIG" manually if you use our provided Kubernetes template. @@ -327,13 +325,13 @@ start training. On your laptop, you can run ```python -estimator = ... distribution = tf.contrib.distribute.CollectiveAllReduceStrategy( num_gpus_per_worker=2) config = tf.estimator.RunConfig( experimental_distribute=tf.contrib.distribute.DistributeConfig( train_distribute=distribution, remote_cluster={"worker": ["host1:port", "host2:port", "host3:port"]})) +estimator = tf.estimator.Estimator(model_fn=model_fn, config=config) train_spec = tf.estimator.TrainSpec(input_fn=input_fn) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn) tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py index 69784c737ab47ac32fa389a9577d97c5a4718da2..8ec73654e30e4967f318c558ba94301e84a206e4 100644 --- a/tensorflow/contrib/distribute/__init__.py +++ b/tensorflow/contrib/distribute/__init__.py @@ -25,13 +25,13 @@ from __future__ import print_function # pylint: disable=unused-import,wildcard-import from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy -from tensorflow.contrib.distribute.python.cross_tower_ops import * from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy from tensorflow.contrib.distribute.python.monitor import Monitor from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy from tensorflow.contrib.distribute.python.step_fn import * from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy +from tensorflow.python.distribute.cross_device_ops import * from tensorflow.python.distribute.distribute_config import DistributeConfig from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server from tensorflow.python.training.distribute import * @@ -63,6 +63,7 @@ _allowed_symbols = [ 'get_loss_reduction', 'get_replica_context', 'has_distribution_strategy', + 'in_cross_replica_context', 'require_replica_context', 'run_standard_tensorflow_server', 'UpdateContext', diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 4b96179077d69a6f5220bba3dd86f3200be87466..91282a8c1dab051da7894956d202c88c90e2fe39 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -16,45 +16,26 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test") # TODO(priyag): Figure out testonly issues that are preventing us from # including our tests in pip for now. -py_library( - name = "values", - srcs = ["values.py"], - visibility = ["//tensorflow:internal"], - deps = [ - ":input_ops", - "//tensorflow/python:array_ops", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:device_util", - "//tensorflow/python:distribute", - "//tensorflow/python:framework_ops", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:training", - "//tensorflow/python:util", - "//tensorflow/python/data/ops:multi_device_iterator_ops", - "//tensorflow/python/eager:context", - "//tensorflow/python/training/checkpointable:base", - "@six_archive//:six", - ], -) - cuda_py_test( name = "values_test", srcs = ["values_test.py"], additional_deps = [ + ":combinations", ":mirrored_strategy", ":multi_worker_test_base", - ":values", + "@absl_py//absl/testing:parameterized", "//tensorflow/core:protos_all_py", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python:errors", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", + "//tensorflow/python:device_util", + "//tensorflow/python:errors", "//tensorflow/python:framework_ops", "//tensorflow/python:framework_test_lib", "//tensorflow/python:training", "//tensorflow/python:variable_scope", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", - "//tensorflow/python:device_util", "//tensorflow/python/eager:test", "//tensorflow/python/estimator:estimator_py", ], @@ -68,28 +49,9 @@ py_library( srcs = ["mirrored_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":cross_tower_ops", - ":shared_variable_creator", - ":values", - "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:device", - "//tensorflow/python:device_util", "//tensorflow/python:distribute", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:pywrap_tensorflow", - "//tensorflow/python:tensor_util", - "//tensorflow/python:training", - "//tensorflow/python:util", - "//tensorflow/python:variable_scope", - "//tensorflow/python:variables", - "//tensorflow/python/distribute:multi_worker_util", - "//tensorflow/python/distribute:reduce_util", - "//tensorflow/python/eager:context", - "//tensorflow/python/eager:tape", + "//tensorflow/python/distribute:mirrored_strategy", + "//tensorflow/python/distribute:values", ], ) @@ -98,17 +60,17 @@ py_library( srcs = ["parameter_server_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":cross_tower_ops", ":mirrored_strategy", - ":values", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python/distribute:cross_device_ops", "//tensorflow/python/distribute:multi_worker_util", "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", ], ) @@ -121,7 +83,6 @@ cuda_py_test( ":multi_worker_test_base", ":parameter_server_strategy", ":strategy_test_lib", - ":values", "@absl_py//absl/testing:parameterized", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", @@ -137,6 +98,7 @@ cuda_py_test( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/distribute:multi_worker_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "//tensorflow/python/estimator:estimator_py", ], @@ -151,14 +113,13 @@ py_library( srcs = ["one_device_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":values", - "//tensorflow/contrib/eager/python:datasets", "//tensorflow/python:array_ops", "//tensorflow/python:distribute", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "@six_archive//:six", ], @@ -169,16 +130,16 @@ py_library( srcs = ["collective_all_reduce_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - ":cross_tower_ops", - ":cross_tower_utils", ":mirrored_strategy", - ":values", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:collective_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:training", + "//tensorflow/python/distribute:cross_device_ops", + "//tensorflow/python/distribute:cross_device_utils", "//tensorflow/python/distribute:multi_worker_util", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", ], ) @@ -241,28 +202,6 @@ py_test( ], ) -py_test( - name = "mirrored_strategy_test", - srcs = ["mirrored_strategy_test.py"], - srcs_version = "PY2AND3", - tags = [ - "no_pip", - ], - deps = [ - ":mirrored_strategy", - ":multi_worker_test_base", - ":strategy_test_lib", - "//tensorflow/python:constant_op", - "//tensorflow/python:distribute", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python/eager:context", - "//tensorflow/python/eager:test", - ], -) - py_test( name = "one_device_strategy_test", srcs = ["one_device_strategy_test.py"], @@ -278,35 +217,32 @@ py_test( ], ) +# TODO(priyag): Rename this test to mirrored_strategy_test cuda_py_test( name = "mirrored_strategy_multigpu_test", srcs = ["mirrored_strategy_multigpu_test.py"], additional_deps = [ + ":combinations", ":mirrored_strategy", ":multi_worker_test_base", - ":values", ":strategy_test_lib", - "//tensorflow/python:distribute", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", + "//tensorflow/python:distribute", + "//tensorflow/python:framework_test_lib", "//tensorflow/python:layers", "//tensorflow/python:state_ops", "//tensorflow/python:variable_scope", - "//tensorflow/python:framework_test_lib", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], + shard_count = 5, tags = [ "guitar", - "no_pip", "multi_and_single_gpu", - # Do not perform the extra analysis on this test, because it is already - # performed for the `:mirrored_strategy_test` target. - "no_oss", - "noasan", - "notap", - "notsan", + "no_pip", ], ) @@ -345,7 +281,6 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ ":one_device_strategy", - ":values", "//tensorflow/contrib/tpu:tpu_lib", "//tensorflow/python:constant_op", "//tensorflow/python:control_flow_ops", @@ -354,6 +289,7 @@ py_library( "//tensorflow/python:tensor_util", "//tensorflow/python:util", "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:values", ], ) @@ -363,7 +299,6 @@ cuda_py_test( additional_deps = [ ":collective_all_reduce_strategy", ":combinations", - ":cross_tower_utils", ":multi_worker_test_base", ":strategy_test_lib", "@absl_py//absl/testing:parameterized", @@ -379,6 +314,7 @@ cuda_py_test( "//tensorflow/python:layers", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/distribute:cross_device_utils", "//tensorflow/python/eager:context", "//tensorflow/python/estimator:estimator_py", ], @@ -509,7 +445,9 @@ cuda_py_test( "//third_party/py/numpy", "//tensorflow/contrib/optimizer_v2:training", "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/distribute", + "//tensorflow/python/distribute:distribute_config", + "//tensorflow/python/distribute:distribute_coordinator", + "//tensorflow/python/distribute:distribute_coordinator_context", "//tensorflow/python/eager:test", "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/feature_column", @@ -517,7 +455,7 @@ cuda_py_test( "//tensorflow/python:platform", "//tensorflow/python:summary", ], - shard_count = 5, + shard_count = 48, tags = [ "multi_and_single_gpu", "no_pip", @@ -601,52 +539,16 @@ cuda_py_test( ], ) -py_library( - name = "shared_variable_creator", - srcs = ["shared_variable_creator.py"], - visibility = ["//tensorflow:internal"], -) - -py_test( - name = "shared_variable_creator_test", - srcs = ["shared_variable_creator_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":shared_variable_creator", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:variable_scope", - "//tensorflow/python/eager:test", - ], -) - -py_library( - name = "cross_tower_utils", - srcs = ["cross_tower_utils.py"], - srcs_version = "PY2AND3", - deps = [ - ":values", - "//tensorflow/python:array_ops", - "//tensorflow/python:collective_ops", - "//tensorflow/python:device", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", - "//tensorflow/python:nccl_ops", - "//tensorflow/python/distribute:all_reduce", - ], -) - cuda_py_test( - name = "cross_tower_utils_test", - srcs = ["cross_tower_utils_test.py"], + name = "cross_device_utils_test", + srcs = ["cross_device_utils_test.py"], additional_deps = [ ":combinations", - ":cross_tower_utils", "@absl_py//absl/testing:parameterized", "//tensorflow/python:constant_op", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python/distribute:cross_device_utils", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], @@ -655,41 +557,20 @@ cuda_py_test( ], ) -py_library( - name = "cross_tower_ops", - srcs = ["cross_tower_ops.py"], - srcs_version = "PY2AND3", - deps = [ - ":cross_tower_utils", - ":values", - "//tensorflow/python:array_ops", - "//tensorflow/python:device_lib", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:training", - "//tensorflow/python:variable_scope", - "//tensorflow/python/distribute:reduce_util", - "//tensorflow/python/eager:context", - "@six_archive//:six", - ], -) - cuda_py_test( - name = "cross_tower_ops_test", - srcs = ["cross_tower_ops_test.py"], + name = "cross_device_ops_test", + srcs = ["cross_device_ops_test.py"], additional_deps = [ ":combinations", - ":cross_tower_ops", ":multi_worker_test_base", ":mirrored_strategy", - ":values", "@absl_py//absl/testing:parameterized", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python/distribute:cross_device_ops", + "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], @@ -699,37 +580,6 @@ cuda_py_test( ], ) -py_library( - name = "input_ops", - srcs = ["input_ops.py"], - visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/python:framework_ops", - "//tensorflow/python/data/util:nest", - ], -) - -cuda_py_test( - name = "input_ops_test", - srcs = ["input_ops_test.py"], - additional_deps = [ - ":input_ops", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/contrib/data/python/ops:batching", - "//tensorflow/contrib/data/python/ops:interleave_ops", - "//tensorflow/python:errors", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:io_ops", - "//tensorflow/python/data/ops:readers", - "//tensorflow/python:util", - ], - tags = [ - "no_pip", - ], -) - py_library( name = "keras_test_lib", testonly = 1, @@ -769,7 +619,6 @@ py_library( srcs = ["metrics_v1_test.py"], deps = [ ":combinations", - "//tensorflow/contrib/data/python/ops:batching", "//tensorflow/python:math_ops", "//tensorflow/python:metrics", "//tensorflow/python:variables", diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py index d38bdb592a303d23871b48d80868917efc01dcd1..31bd0e996a247a2fc01405fb3b8172a40853d698 100644 --- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py +++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py @@ -43,7 +43,9 @@ class CheckpointUtilsWithDistributionStrategyTest( distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], in_replica_mode=[True, False], mode=["graph"])) def testInitFromCheckpoint(self, distribution, in_replica_mode): diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py index dd7d29ac2b03601473f130d61fd34715fce80b77..906377b7395a520780e485461b83298320ebdcb3 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py @@ -18,12 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import cross_tower_utils from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -79,11 +79,11 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): else: local_devices = ["/device:CPU:0"] - self._collective_keys = cross_tower_utils.CollectiveKeys() + self._collective_keys = cross_device_utils.CollectiveKeys() super(CollectiveAllReduceExtended, self).__init__( container_strategy, devices=local_devices, - cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce( + cross_device_ops=cross_device_ops_lib.CollectiveAllReduce( num_workers=1, num_gpus_per_worker=num_gpus_per_worker, collective_keys=self._collective_keys)) @@ -123,11 +123,11 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): else: local_devices = [worker_device] - self._collective_keys = cross_tower_utils.CollectiveKeys() + self._collective_keys = cross_device_utils.CollectiveKeys() super(CollectiveAllReduceExtended, self).__init__( container_strategy, devices=local_devices, - cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce( + cross_device_ops=cross_device_ops_lib.CollectiveAllReduce( num_workers=self._num_workers, num_gpus_per_worker=num_gpus_per_worker, collective_keys=self._collective_keys)) @@ -234,8 +234,9 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): num_input_pipelines=self._num_workers, input_pipeline_id=input_pipeline_id, num_replicas_in_sync=self._num_replicas_in_sync) - return values.PerReplicaDataset( - self._call_dataset_fn(input_fn, input_context), self._devices, True) + + return values.InputFunctionIterator( + input_fn, [(self._default_device, self._devices)], [input_context]) def _configure(self, session_config=None, @@ -319,3 +320,8 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): @property def _num_replicas_in_sync(self): return len(self._devices) * self._num_workers + + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return False diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index ae7b88aa2d715f0164908d934008003b124db831..eb2b859aa559dd0c72351a009149ffdcb3c96b7c 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -23,15 +23,18 @@ import numpy as np from tensorflow.contrib.distribute.python import collective_all_reduce_strategy from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import cross_tower_utils from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import strategy_test_lib from tensorflow.core.protobuf import config_pb2 from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.layers import core from tensorflow.python.ops import array_ops @@ -73,7 +76,7 @@ class CollectiveAllReduceStrategyTestBase( cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id) - collective_keys = cross_tower_utils.CollectiveKeys( + collective_keys = cross_device_utils.CollectiveKeys( group_key_start=10 * num_gpus + CollectiveAllReduceStrategyTestBase.collective_key_base, instance_key_start=num_gpus * 100 + @@ -81,7 +84,7 @@ class CollectiveAllReduceStrategyTestBase( instance_key_with_id_start=num_gpus * 10000 + CollectiveAllReduceStrategyTestBase.collective_key_base) distribution.extended._collective_keys = collective_keys - distribution.extended._cross_tower_ops._collective_keys = collective_keys + distribution.extended._cross_device_ops._collective_keys = collective_keys if task_type and task_id is not None: return distribution, 'grpc://' + self._cluster_spec[task_type][ task_id], session_config @@ -242,9 +245,42 @@ class CollectiveAllReduceStrategyTestBase( reduced_x_value))) return np.allclose(x_value, reduced_x_value, atol=1e-5) + def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn, + expected_values): + distribution, master_target, config = self._get_test_object( + task_type, task_id, num_gpus) + devices = distribution.extended.worker_devices + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + iterator = distribution.make_input_fn_iterator(input_fn) + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + sess.run([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + class DistributedCollectiveAllReduceStrategyTest( - CollectiveAllReduceStrategyTestBase, parameterized.TestCase): + CollectiveAllReduceStrategyTestBase, + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): @classmethod def setUpClass(cls): @@ -272,7 +308,7 @@ class DistributedCollectiveAllReduceStrategyTest( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) def testVariableInitialization(self, num_gpus): if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._run_between_graph_clients( self._test_variable_initialization, self._cluster_spec, @@ -282,10 +318,30 @@ class DistributedCollectiveAllReduceStrategyTest( combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) def testComplexModel(self, num_gpus): if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._run_between_graph_clients( self._test_complex_model, self._cluster_spec, num_gpus=num_gpus) + # TODO(yuefengz): Update how we use num_gpus and required_gpus + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testMakeInputFnIterator(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(100) + # We use CPU as the device when num_gpus = 0 + devices_per_worker = max(1, num_gpus) + expected_values = [[i+j for j in range(devices_per_worker)] + for i in range(0, 100, devices_per_worker)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=3*devices_per_worker, + expected_num_input_pipelines=3, + expected_input_pipeline_id=1) # because task_id = 1 + self._test_input_fn_iterator('worker', 1, num_gpus, + input_fn, expected_values) + class DistributedCollectiveAllReduceStrategyTestWithChief( CollectiveAllReduceStrategyTestBase, parameterized.TestCase): @@ -326,44 +382,35 @@ class DistributedCollectiveAllReduceStrategyTestWithChief( class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase, + strategy_test_lib.DistributionTestBase, parameterized.TestCase): def testMinimizeLossGraph(self, num_gpus=2): # Collective ops doesn't support strategy with one device. if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._test_minimize_loss_graph(None, None, num_gpus) def testComplexModel(self, num_gpus=2): # Collective ops doesn't support strategy with one device. if context.num_gpus() < num_gpus: - return + self.skipTest('Not enough GPUs') self._test_complex_model(None, None, num_gpus) - -class InputContextTest(strategy_test_lib.DistributionTestBase): - - def testInputContextPropertyLocal(self): - d = collective_all_reduce_strategy.CollectiveAllReduceStrategy( - num_gpus_per_worker=2) - with context.graph_mode(): - input_fn = self._input_fn_to_test_input_context( - expected_num_replicas_in_sync=2, - expected_num_input_pipelines=1, - expected_input_pipeline_id=0) - d.make_input_fn_iterator(input_fn) - - def testInputContextPropertyMultiWorker(self): - d = collective_all_reduce_strategy.CollectiveAllReduceStrategy( - num_gpus_per_worker=2) - cluster_spec = {'worker': ['worker1', 'worker2', 'worker3'], 'ps': ['ps1']} - d.configure(cluster_spec=cluster_spec, task_type='worker', task_id=1) - with context.graph_mode(): - input_fn = self._input_fn_to_test_input_context( - expected_num_replicas_in_sync=6, - expected_num_input_pipelines=3, - expected_input_pipeline_id=1) # because task_id = 1 - d.make_input_fn_iterator(input_fn) + def testMakeInputFnIterator(self, num_gpus=2): + # Collective ops doesn't support strategy with one device. + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(10) + expected_values = [[i, i+1] for i in range(0, 10, 2)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) + self._test_input_fn_iterator(None, None, num_gpus, + input_fn, expected_values) if __name__ == '__main__': diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py index a51371654031e32d084e2b0e8ae345bb2c166ae8..f3ce547f4d0ffc8d507c77adb22293edf7c54373 100644 --- a/tensorflow/contrib/distribute/python/combinations.py +++ b/tensorflow/contrib/distribute/python/combinations.py @@ -168,6 +168,8 @@ def _augment_with_special_arguments(test_method): if GPU_TEST: self.skipTest("Test that doesn't require GPUs.") elif context.num_gpus() < required_gpus: + # TODO(priyag): Consider allowing tests in graph mode using soft + # placement. self.skipTest( "{} GPUs are not available for this test. {} GPUs are available". format(required_gpus, context.num_gpus())) @@ -335,6 +337,13 @@ tpu_strategy_one_step = NamedDistribution( "TPUOneStep", lambda: tpu_lib.TPUStrategy( TPUClusterResolver(""), steps_per_run=1), required_tpu=True) +mirrored_strategy_with_one_cpu = NamedDistribution( + "Mirrored1CPU", + lambda: mirrored_lib.MirroredStrategy(["/cpu:0"])) +mirrored_strategy_with_one_gpu = NamedDistribution( + "Mirrored1GPU", + lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]), + required_gpus=1) mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]), @@ -343,6 +352,21 @@ mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]), required_gpus=2) +core_mirrored_strategy_with_one_cpu = NamedDistribution( + "CoreMirrored1CPU", + lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"])) +core_mirrored_strategy_with_one_gpu = NamedDistribution( + "CoreMirrored1GPU", + lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]), + required_gpus=1) +core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution( + "CoreMirroredCPUAndGPU", + lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]), + required_gpus=1) +core_mirrored_strategy_with_two_gpus = NamedDistribution( + "CoreMirrored2GPUs", + lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]), + required_gpus=2) gradient_descent_optimizer_v1_fn = NamedObject( @@ -373,8 +397,11 @@ def distributions_and_v1_optimizers(): """A common set of combination with DistributionStrategies and Optimizers.""" return combine( distribution=[ - one_device_strategy, mirrored_strategy_with_gpu_and_cpu, - mirrored_strategy_with_two_gpus + one_device_strategy, + mirrored_strategy_with_gpu_and_cpu, + mirrored_strategy_with_two_gpus, + core_mirrored_strategy_with_gpu_and_cpu, + core_mirrored_strategy_with_two_gpus, ], optimizer_fn=optimizers_v1) @@ -383,7 +410,10 @@ def distributions_and_v2_optimizers(): """DistributionStrategies and V2 Optimizers.""" return combine( distribution=[ - one_device_strategy, mirrored_strategy_with_gpu_and_cpu, - mirrored_strategy_with_two_gpus + one_device_strategy, + mirrored_strategy_with_gpu_and_cpu, + mirrored_strategy_with_two_gpus, + core_mirrored_strategy_with_gpu_and_cpu, + core_mirrored_strategy_with_two_gpus, ], optimizer_fn=optimizers_v2) diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py similarity index 83% rename from tensorflow/contrib/distribute/python/cross_tower_ops_test.py rename to tensorflow/contrib/distribute/python/cross_device_ops_test.py index 2e352360a439cb15fcaa24e40632d9a6597eeb00..40410b90be7d9d9ed20fb4e696565cf79c044553 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py +++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py @@ -24,13 +24,13 @@ from absl.testing import parameterized import numpy as np from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import cross_tower_utils from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python import multi_worker_test_base -from tensorflow.contrib.distribute.python import values as value_lib from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op @@ -41,7 +41,7 @@ from tensorflow.python.training import device_util def _make_per_replica(values, devices, regroup=False): - devices = cross_tower_ops_lib.get_devices_from(devices) + devices = cross_device_ops_lib.get_devices_from(devices) assert len(values) == len(devices) # We simulate the result of regroup called on PerReplica which strips the @@ -66,7 +66,7 @@ def _fake_mirrored(value, devices): All components of the returned Mirrored have the same objects, which is not true in reality. """ - devices = cross_tower_ops_lib.get_devices_from(devices) + devices = cross_device_ops_lib.get_devices_from(devices) return value_lib.Mirrored( {d: v for d, v in zip(devices, [value] * len(devices))}) @@ -118,8 +118,8 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): self.assertEqual( sess.run(list(left._index.values())), list(right._index.values())) - def _testReductionAndBroadcast(self, cross_tower_ops, distribution): - devices = distribution.worker_devices + def _testReductionAndBroadcast(self, cross_device_ops, distribution): + devices = distribution.extended.worker_devices values = [constant_op.constant(float(d)) for d in range(len(devices))] per_replica = _make_per_replica(values, devices) @@ -142,24 +142,24 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): # test reduce() for destinations in all_destinations: self._assert_values_equal( - cross_tower_ops.reduce( + cross_device_ops.reduce( reduce_util.ReduceOp.MEAN, per_replica, destinations=destinations), _fake_mirrored(mean, destinations)) self._assert_values_equal( - cross_tower_ops.reduce( + cross_device_ops.reduce( reduce_util.ReduceOp.MEAN, per_replica_2, destinations=destinations), _fake_mirrored(mean_2, destinations)) self._assert_values_equal( - cross_tower_ops.reduce( + cross_device_ops.reduce( reduce_util.ReduceOp.SUM, per_replica, destinations=destinations), _fake_mirrored(mean * len(devices), destinations)) self._assert_values_equal( - cross_tower_ops.reduce( + cross_device_ops.reduce( reduce_util.ReduceOp.SUM, per_replica_2, destinations=destinations), @@ -168,7 +168,7 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): # test batch_reduce() for d1, d2 in itertools.product(all_destinations, all_destinations): self._assert_values_equal( - cross_tower_ops.batch_reduce( + cross_device_ops.batch_reduce( reduce_util.ReduceOp.MEAN, [(per_replica, d1), (per_replica_2, d2)]), [ @@ -176,7 +176,7 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): _fake_mirrored(mean_2, d2) ]) self._assert_values_equal( - cross_tower_ops.batch_reduce( + cross_device_ops.batch_reduce( reduce_util.ReduceOp.SUM, [(per_replica, d1), (per_replica_2, d2)]), [ @@ -187,7 +187,7 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): # test broadcast() for destinations in all_destinations: self._assert_values_equal( - cross_tower_ops.broadcast(constant_op.constant(1.), destinations), + cross_device_ops.broadcast(constant_op.constant(1.), destinations), _fake_mirrored(1., destinations)) @@ -196,62 +196,65 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): # combinations module so that we can pass in devices instead of a distribution # strategy. reduction_to_one_combinations = combinations.combine( - cross_tower_ops=[ + cross_device_ops=[ combinations.NamedObject( "DefaultReductionToOneDeviceCrossDeviceOps", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()), + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()), combinations.NamedObject( "ReductionToCPUDeviceCrossDeviceOps", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps( + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps( reduce_to_device=_cpu_device)), combinations.NamedObject( "AccumulateNCrossDeviceOp", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps( + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps( accumulation_fn=math_ops.accumulate_n)), ], distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ], mode=["graph", "eager"]) allreduce_combinations = combinations.combine( - cross_tower_ops=[ + cross_device_ops=[ combinations.NamedObject( "AllReduce", - cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)), + cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)), combinations.NamedObject( "HierarchicalCopy", - cross_tower_ops_lib.AllReduceCrossDeviceOps( + cross_device_ops_lib.AllReduceCrossDeviceOps( "hierarchical_copy", 8, 0, 0)), combinations.NamedObject( "AllReduceNoGradientRepacking", - cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)), + cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)), combinations.NamedObject( "HierarchicalCopyAggregateSmallTensors", - cross_tower_ops_lib.AllReduceCrossDeviceOps( + cross_device_ops_lib.AllReduceCrossDeviceOps( "hierarchical_copy", 0, 100, 10)) ], - distribution=[combinations.mirrored_strategy_with_two_gpus], + distribution=[combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], mode=["graph", "eager"]) @combinations.generate(reduction_to_one_combinations + allreduce_combinations) - def testReductionAndBroadcast(self, cross_tower_ops, distribution): + def testReductionAndBroadcast(self, cross_device_ops, distribution): with distribution.scope(): - self._testReductionAndBroadcast(cross_tower_ops, distribution) + self._testReductionAndBroadcast(cross_device_ops, distribution) def testChooseAlgorithm(self): device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "hierarchical_copy") self.assertEqual(result._num_packs, 8) # if there are only 4 devices device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "nccl") self.assertEqual(result._num_packs, 1) @@ -259,16 +262,16 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6], [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7], [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "hierarchical_copy") self.assertEqual(result._num_packs, 8) # if not dgx1-like links device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]] - result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps) + result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links) + self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps) self.assertEqual(result._all_reduce_alg, "nccl") self.assertEqual(result._num_packs, 1) @@ -280,7 +283,7 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0]) t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1]) per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1}) - result = cross_tower_ops_lib._simple_reduce( + result = cross_device_ops_lib._simple_reduce( per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM) # Test that the result is semantically equal to both the concatenated @@ -294,19 +297,19 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): @combinations.generate( combinations.combine( - cross_tower_ops_instance=[ + cross_device_ops_instance=[ combinations.NamedObject( "ReductionToOneDeviceCrossDeviceOps", - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()), + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()), combinations.NamedObject( "AllReduceCrossDeviceOps", - cross_tower_ops_lib.AllReduceCrossDeviceOps()) + cross_device_ops_lib.AllReduceCrossDeviceOps()) ], reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN], batch_reduce=[True, False], mode=["graph", "eager"], required_gpus=1)) - def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, reduce_op, + def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op, batch_reduce): devices = ["/cpu:0", "/gpu:0"] dense_shape = [5, 2] @@ -316,10 +319,10 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1}) if batch_reduce: - result = cross_tower_ops_instance.batch_reduce( + result = cross_device_ops_instance.batch_reduce( reduce_op, [(per_replica, devices)]) else: - result = cross_tower_ops_instance.reduce( + result = cross_device_ops_instance.reduce( reduce_op, per_replica, devices) total_indices_with_dups = [1, 1, 3] @@ -356,49 +359,65 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase, "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1" ] multi_worker_allreduce_combinations = combinations.combine( - cross_tower_ops=[ + cross_device_ops=[ combinations.NamedObject( "MultiWorkerAllReduce", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)), combinations.NamedObject( "MultiWorkerAllReducePack", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)), combinations.NamedObject( "MultiWorkerAllReduceAggregation", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)), combinations.NamedObject( "MultiWorkerAllReduceMultipleSpecs", - cross_tower_ops_lib.MultiWorkerAllReduce( + cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, [("pscpu/pscpu", 2, 100), ("xring", 2, -1)], 0, 0, 0)), ], distribution=[ combinations.NamedDistribution( "MirroredCPU", - lambda: mirrored_strategy.MirroredStrategy(num_gpus=0), + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=0), required_gpus=0), combinations.NamedDistribution( "Mirrored1GPU", - lambda: mirrored_strategy.MirroredStrategy(num_gpus=1), + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=1), required_gpus=1), combinations.NamedDistribution( "Mirrored2GPUs", - lambda: mirrored_strategy.MirroredStrategy(num_gpus=2), + lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2), + required_gpus=2), + # pylint: disable=g-long-lambda + combinations.NamedDistribution( + "CoreMirroredCPU", + lambda: mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=0), + required_gpus=0), + combinations.NamedDistribution( + "CoreMirrored1GPU", + lambda: mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=1), + required_gpus=1), + combinations.NamedDistribution( + "CoreMirrored2GPUs", + lambda: mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=2), required_gpus=2), ], mode=["graph"]) @combinations.generate(multi_worker_allreduce_combinations) - def testReductionAndBroadcast(self, cross_tower_ops, distribution): + def testReductionAndBroadcast(self, cross_device_ops, distribution): distribution.configure(cluster_spec={ "worker": ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"] }) with distribution.scope(): - self._testReductionAndBroadcast(cross_tower_ops, distribution) + self._testReductionAndBroadcast(cross_device_ops, distribution) class MultiWorkerCollectiveAllReduceTest( @@ -419,7 +438,7 @@ class MultiWorkerCollectiveAllReduceTest( MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000 def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False): - collective_keys = cross_tower_utils.CollectiveKeys( + collective_keys = cross_device_utils.CollectiveKeys( group_key_start=10 * num_gpus + MultiWorkerCollectiveAllReduceTest.collective_key_base, instance_key_start=num_gpus * 100 + @@ -427,7 +446,7 @@ class MultiWorkerCollectiveAllReduceTest( instance_key_with_id_start=num_gpus * 10000 + MultiWorkerCollectiveAllReduceTest.collective_key_base) if local_mode: - collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce( + collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( 1, num_gpus, collective_keys=collective_keys) if num_gpus: devices = ["/device:GPU:%d" % i for i in range(num_gpus)] @@ -435,7 +454,7 @@ class MultiWorkerCollectiveAllReduceTest( devices = ["/device:CPU:0"] return collective_all_reduce_ops, devices, "" else: - collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce( + collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( 3, num_gpus, collective_keys=collective_keys) if num_gpus: devices = [ diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_device_utils_test.py similarity index 84% rename from tensorflow/contrib/distribute/python/cross_tower_utils_test.py rename to tensorflow/contrib/distribute/python/cross_device_utils_test.py index e46240abbfa3d3618009f8bafe5db66e06e8bbd3..6086eba0984782f5e85235142817569bee135df0 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py +++ b/tensorflow/contrib/distribute/python/cross_device_utils_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for cross_tower_utils.""" +"""Tests for cross_device_utils.""" from __future__ import absolute_import from __future__ import division @@ -21,8 +21,8 @@ from __future__ import print_function from absl.testing import parameterized from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import cross_tower_utils -from tensorflow.contrib.distribute.python import values as value_lib +from tensorflow.python.distribute import cross_device_utils +from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops @@ -43,7 +43,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]]) total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]]) - result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1]) + result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1]) self._assert_values_equal(total, result) @test_util.run_in_graph_and_eager_modes @@ -53,7 +53,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]]) - result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1]) + result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1]) self.assertIsInstance(result, ops.IndexedSlices) self._assert_values_equal(total, result) @@ -62,7 +62,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) n = 2 expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]]) - result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n) + result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n) self._assert_values_equal(expected, result) @test_util.run_in_graph_and_eager_modes @@ -71,7 +71,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) n = 2 expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]]) - result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n) + result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n) self.assertIsInstance(result, ops.IndexedSlices) self._assert_values_equal(expected, result) @@ -79,7 +79,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): def testIsIndexedSlices(self): t = math_ops._as_indexed_slices( constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) - self.assertTrue(cross_tower_utils.contains_indexed_slices(t)) + self.assertTrue(cross_device_utils.contains_indexed_slices(t)) @test_util.run_in_graph_and_eager_modes def testContainsIndexedSlices_List(self): @@ -87,7 +87,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) - self.assertTrue(cross_tower_utils.contains_indexed_slices([t0, t1])) + self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1])) @test_util.run_in_graph_and_eager_modes def testContainsIndexedSlices_Tuple(self): @@ -95,7 +95,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) - self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1))) + self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1))) @test_util.run_in_graph_and_eager_modes def testContainsIndexedSlices_PerReplica(self): @@ -104,7 +104,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t1 = math_ops._as_indexed_slices( constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1}) - self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica)) + self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica)) @combinations.generate(combinations.combine( mode=["graph", "eager"], @@ -113,7 +113,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): with ops.device("/cpu:0"): t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) destination = "/gpu:0" - result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + result = cross_device_utils.copy_tensor_or_indexed_slices_to_device( t, destination) self._assert_values_equal(t, result) @@ -128,7 +128,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): t = math_ops._as_indexed_slices( constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) destination = "/gpu:0" - result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + result = cross_device_utils.copy_tensor_or_indexed_slices_to_device( t, destination) self.assertIsInstance(result, ops.IndexedSlices) diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py index ecda5337bb51b8dbe174e94b3387e3e428e8180a..e17085628ba6d1dfc79839fd824801723f07a518 100644 --- a/tensorflow/contrib/distribute/python/estimator_integration_test.py +++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py @@ -63,7 +63,9 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ], use_train_and_evaluate=[True, False])) def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate): @@ -75,12 +77,12 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, train_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices), + batch_size=batch_size // distribution.num_replicas_in_sync, shuffle=True) eval_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices), + batch_size=batch_size // distribution.num_replicas_in_sync, shuffle=False) predict_input_fn = numpy_io.numpy_input_fn( x={'x': data}, batch_size=batch_size, shuffle=False) diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py index cc15fc80a9882de3eab7d5bc9cba98b96db6db3c..0f35657a8099523b6ba5b8f0a1a2f289c06b531a 100644 --- a/tensorflow/contrib/distribute/python/estimator_training_test.py +++ b/tensorflow/contrib/distribute/python/estimator_training_test.py @@ -50,6 +50,8 @@ from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.summary import summary_iterator from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import session_manager + BATCH_SIZE = 10 LABEL_DIMENSION = 2 @@ -202,10 +204,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, train_input_fn = self.dataset_input_fn( x={"x": DATA}, y=DATA, - batch_size=BATCH_SIZE // len(train_distribute.worker_devices), + batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync, shuffle=True) if eval_distribute: - eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices) + eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync else: eval_batch_size = BATCH_SIZE eval_input_fn = self.dataset_input_fn( @@ -291,10 +293,13 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, train_distribute_cls=[ collective_all_reduce_strategy.CollectiveAllReduceStrategy, mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, parameter_server_strategy.ParameterServerStrategy ], eval_distribute_cls=[ - None, mirrored_strategy.MirroredStrategy, + None, + mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, parameter_server_strategy.ParameterServerStrategy, ], required_gpus=[0, 1])) @@ -322,10 +327,12 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, mode=["graph"], train_distribute_cls=[ mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, ], eval_distribute_cls=[ None, mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, ], required_gpus=[0, 1])) def test_estimator_standalone_client(self, train_distribute_cls, @@ -405,6 +412,7 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, ], eval_distribute_cls=[ None, mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy, parameter_server_strategy.ParameterServerStrategy, ], required_gpus=[0, 1])) @@ -449,8 +457,15 @@ class DistributeCoordinatorIntegrationTest(test.TestCase, @combinations.generate( combinations.combine( mode=["graph"], - train_distribute_cls=[mirrored_strategy.MirroredStrategy], - eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy], + train_distribute_cls=[ + mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy + ], + eval_distribute_cls=[ + None, + mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy + ], required_gpus=[0, 1])) def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls, eval_distribute_cls): @@ -506,7 +521,8 @@ class RunConfigTest(test.TestCase): "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}): run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + train_distribute=mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=2))) def test_should_run_distribute_coordinator(self): """Tests that should_run_distribute_coordinator return a correct value.""" @@ -529,10 +545,12 @@ class RunConfigTest(test.TestCase): {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): config_with_train_distribute = run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + train_distribute=mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=2))) config_with_eval_distribute = run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + eval_distribute=mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=2))) self.assertTrue( dc_training.should_run_distribute_coordinator( config_with_train_distribute)) @@ -545,26 +563,27 @@ class RunConfigTest(test.TestCase): {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): config = run_config_lib.RunConfig( experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2))) + train_distribute=mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=2))) self.assertFalse(dc_training.should_run_distribute_coordinator(config)) def test_init_run_config_duplicate_distribute(self): with self.assertRaises(ValueError): run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy(), + train_distribute=mirrored_strategy.CoreMirroredStrategy(), experimental_distribute=DistributeConfig( - train_distribute=mirrored_strategy.MirroredStrategy())) + train_distribute=mirrored_strategy.CoreMirroredStrategy())) with self.assertRaises(ValueError): run_config_lib.RunConfig( - eval_distribute=mirrored_strategy.MirroredStrategy(), + eval_distribute=mirrored_strategy.CoreMirroredStrategy(), experimental_distribute=DistributeConfig( - eval_distribute=mirrored_strategy.MirroredStrategy())) + eval_distribute=mirrored_strategy.CoreMirroredStrategy())) def test_init_run_config_none_distribute_coordinator_mode(self): # We don't use distribute coordinator for local training. config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy()) + train_distribute=mirrored_strategy.CoreMirroredStrategy()) dc_training.init_run_config(config, {}) self.assertIsNone(config._distribute_coordinator_mode) @@ -572,7 +591,7 @@ class RunConfigTest(test.TestCase): with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy()) + train_distribute=mirrored_strategy.CoreMirroredStrategy()) self.assertIsNone(config._distribute_coordinator_mode) # When `train_distribute` is not specified, don't use distribute @@ -588,7 +607,7 @@ class RunConfigTest(test.TestCase): with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy()) + train_distribute=mirrored_strategy.CoreMirroredStrategy()) self.assertEqual(config._distribute_coordinator_mode, dc.CoordinatorMode.INDEPENDENT_WORKER) @@ -597,7 +616,7 @@ class RunConfigTest(test.TestCase): # `experimental.remote_cluster` is set use distribute coordinator with # STANDALONE_CLIENT mode. config = run_config_lib.RunConfig( - train_distribute=mirrored_strategy.MirroredStrategy(), + train_distribute=mirrored_strategy.CoreMirroredStrategy(), experimental_distribute=DistributeConfig( remote_cluster={"chief": ["fake_worker"]})) self.assertEqual(config._distribute_coordinator_mode, @@ -605,5 +624,15 @@ class RunConfigTest(test.TestCase): if __name__ == "__main__": + # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly. + orig_init = session_manager.SessionManager.__init__ + + def new_init(*args, **kwargs): + kwargs.pop("recovery_wait_secs", None) + kwargs["recovery_wait_secs"] = 0.5 + orig_init(*args, **kwargs) + + session_manager.SessionManager.__init__ = new_init + with test.mock.patch.object(sys, "exit", os._exit): test.main() diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py index 6a4cbb113a9cc91dd60c2dc03606e328e754e4f0..fba06283ce560390b9a408ac7ceb30bbe17a754b 100644 --- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py +++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py @@ -25,11 +25,9 @@ import numpy as np import six from tensorflow.contrib.distribute.python import combinations -from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.core.protobuf import config_pb2 from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.eager import context from tensorflow.python.estimator import run_config from tensorflow.python.estimator import training from tensorflow.python.estimator.canned import dnn_linear_combined @@ -71,7 +69,9 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase): distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ], use_train_and_evaluate=[True, False])) def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate): @@ -83,11 +83,11 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase): train_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices)) + batch_size=batch_size // distribution.num_replicas_in_sync) eval_input_fn = self.dataset_input_fn( x={'x': data}, y=data, - batch_size=batch_size // len(distribution.worker_devices)) + batch_size=batch_size // distribution.num_replicas_in_sync) predict_input_fn = numpy_io.numpy_input_fn( x={'x': data}, batch_size=batch_size, shuffle=False) @@ -150,12 +150,14 @@ def get_model(): return model -class MirroredStrategyOptimizerV2Test(test.TestCase): - - def testKerasOptimizerWithUnequalInput(self): - if context.num_gpus() < 1: - self.skipTest('Not enough GPUs.') +class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def testKerasOptimizerWithUnequalInput(self, distribution): def create_fn(): var = variables.Variable( 2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM) @@ -168,25 +170,24 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): return (var, m, v, train_op, optimizer.iterations) devices = ['/device:GPU:0', '/device:CPU:0'] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): - (var, m, v, op, counter) = dist.call_for_each_replica(create_fn) + with distribution.scope(): + (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn) self.evaluate(variables.global_variables_initializer()) var_val = [2.0, 2.0, 2.0] self.assertAllClose( var_val, self.evaluate( - [dist.read_var(var), + [distribution.read_var(var), var.get(devices[0]), var.get(devices[1])])) self.assertAllClose([0, 0, 0], self.evaluate([ - dist.read_var(counter), + distribution.read_var(counter), counter.get(devices[0]), counter.get(devices[1]) ])) - train_op = dist.unwrap(op) + train_op = distribution.unwrap(op) self.evaluate(train_op) # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2 m_val = [1.2, 1.2, 1.2] @@ -194,7 +195,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( m_val, self.evaluate( - [dist.read_var(m), + [distribution.read_var(m), m.get(devices[0]), m.get(devices[1])])) # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25 @@ -202,7 +203,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( v_val, self.evaluate( - [dist.read_var(v), + [distribution.read_var(v), v.get(devices[0]), v.get(devices[1])])) # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1) @@ -211,12 +212,12 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( var_val, self.evaluate( - [dist.read_var(var), + [distribution.read_var(var), var.get(devices[0]), var.get(devices[1])])) self.assertAllClose([1, 1, 1], self.evaluate([ - dist.read_var(counter), + distribution.read_var(counter), counter.get(devices[0]), counter.get(devices[1]) ])) @@ -227,7 +228,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( m_val, self.evaluate( - [dist.read_var(m), + [distribution.read_var(m), m.get(devices[0]), m.get(devices[1])])) # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25 @@ -235,28 +236,29 @@ class MirroredStrategyOptimizerV2Test(test.TestCase): self.assertAllClose( v_val, self.evaluate( - [dist.read_var(v), + [distribution.read_var(v), v.get(devices[0]), v.get(devices[1])])) self.assertAllClose([2, 2, 2], self.evaluate([ - dist.read_var(counter), + distribution.read_var(counter), counter.get(devices[0]), counter.get(devices[1]) ])) - def testOptimizerWithKerasModelAndNumpyArrays(self): - if context.num_gpus() < 1: - self.skipTest('Not enough GPUs.') + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def testOptimizerWithKerasModelAndNumpyArrays(self, distribution): with self.cached_session(): model = get_model() optimizer = gradient_descent.SGD(0.001) loss = 'mse' metrics = ['mae'] - devices = ['/device:GPU:0', '/device:CPU:0'] - dist = mirrored_strategy.MirroredStrategy(devices) - model.compile(optimizer, loss, metrics=metrics, distribute=dist) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) inputs = np.zeros((64, 3), dtype=np.float32) targets = np.zeros((64, 4), dtype=np.float32) diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py index abf9e78adcc9e9922579cbb688ee3330962b8890..29d85fe971ff291df9e9ddf74c0082393bf55ba6 100644 --- a/tensorflow/contrib/distribute/python/keras_test.py +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -24,9 +24,9 @@ import numpy as np from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python import tpu_strategy -from tensorflow.contrib.distribute.python import values from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import values from tensorflow.python.estimator import keras as keras_lib from tensorflow.python.estimator import run_config as run_config_lib from tensorflow.python.framework import constant_op @@ -220,7 +220,8 @@ def get_correctness_test_inputs(use_numpy, with_distribution, # TODO(b/118776054): Use global batch size for Keras/DS support. use_per_core_batch_size = ( with_distribution and - not isinstance(with_distribution, tpu_strategy.TPUStrategy)) + not distributed_training_utils.global_batch_size_supported( + with_distribution)) if use_per_core_batch_size: batch_size //= with_distribution.num_replicas_in_sync @@ -279,6 +280,8 @@ strategies = [combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus, combinations.tpu_strategy, # steps_per_run=2 combinations.tpu_strategy_one_step] @@ -288,7 +291,9 @@ def strategy_minus_tpu_combinations(): distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], mode=['graph']) @@ -315,7 +320,8 @@ def strategy_and_inputs(): mode=['graph']) -class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): +class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase, + parameterized.TestCase): def setUp(self): self._base_dir = os.path.join(self.get_temp_dir(), @@ -323,17 +329,18 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): gfile.MakeDirs(self._base_dir) self._config = run_config_lib.RunConfig( tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir) - self._dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) def tearDown(self): writer_cache.FileWriterCache.clear() if os.path.isdir(self._base_dir): gfile.DeleteRecursively(self._base_dir) - def test_train_functional_with_distribution_strategy(self): - dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_train_functional_with_distribution_strategy(self, distribution): keras_model = simple_functional_model() keras_model.compile( loss='categorical_crossentropy', @@ -341,8 +348,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist, - eval_distribute=dist) + train_distribute=distribution, + eval_distribute=distribution) with self.cached_session(): est_keras = keras_lib.model_to_estimator( keras_model=keras_model, config=config) @@ -356,9 +363,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): writer_cache.FileWriterCache.clear() gfile.DeleteRecursively(self._config.model_dir) - def test_train_sequential_with_distribution_strategy(self): - dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_train_sequential_with_distribution_strategy(self, distribution): keras_model = simple_sequential_model() keras_model.compile( loss='categorical_crossentropy', @@ -366,7 +376,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist) + train_distribute=distribution) with self.cached_session(): est_keras = keras_lib.model_to_estimator( keras_model=keras_model, config=config) @@ -380,7 +390,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): writer_cache.FileWriterCache.clear() gfile.DeleteRecursively(self._config.model_dir) - def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution): train_data, test_data = get_multi_inputs_multi_outputs_data() def train_input_fn(): @@ -410,14 +425,14 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): output_dict)).batch(16) self.do_test_multi_inputs_multi_outputs_with_input_fn( - train_input_fn, eval_input_fn) + distribution, train_input_fn, eval_input_fn) - def do_test_multi_inputs_multi_outputs_with_input_fn(self, train_input_fn, - eval_input_fn): + def do_test_multi_inputs_multi_outputs_with_input_fn( + self, distribution, train_input_fn, eval_input_fn): config = run_config_lib.RunConfig( tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=self._dist) + train_distribute=distribution) with self.cached_session(): model = multi_inputs_multi_outputs_model() est_keras = keras_lib.model_to_estimator(keras_model=model, config=config) @@ -427,9 +442,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1) self.assertLess(eval_results['loss'], baseline_eval_results['loss']) - def test_keras_optimizer_with_distribution_strategy(self): - dist = mirrored_strategy.MirroredStrategy( - devices=['/device:GPU:0', '/device:GPU:1']) + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_keras_optimizer_with_distribution_strategy(self, distribution): keras_model = simple_sequential_model() keras_model.compile( loss='categorical_crossentropy', @@ -437,7 +455,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase): config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir, - train_distribute=dist) + train_distribute=distribution) with self.cached_session(): est_keras = keras_lib.model_to_estimator(keras_model=keras_model, config=config) @@ -462,82 +480,133 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, # Verify that the numpy value is copied to the variable. self.assertAllEqual(x, val) - def test_calculating_batch_params(self): - # This verifies that we calculate the number of steps when the batch size - # is specified. + @combinations.generate(strategy_combinations()) + def test_calculating_input_params_no_steps_no_batch_size(self, distribution): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported(distribution): + replica_scale_factor = distribution.num_replicas_in_sync + with self.cached_session(): - # 64 is the number of input samples. - inputs = np.zeros((64, 3), dtype=np.float32) - # The number of replicas is equal to 3. - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0', - '/device:GPU:1']) - - with self.assertRaisesRegexp(ValueError, 'The number of samples is not ' - 'divisible by batch size.'): - # The batch size(128) is larger than the number of input - # samples(64). - distributed_training_utils.get_input_batch_params(inputs, - 128, - strategy) - - with self.assertRaisesRegexp(ValueError, 'is smaller than the number ' - 'of replicas'): - # The batch size(32) * num_replicas_in_sync(3) is 96 which is greater - # than the number of input samples(64). - distributed_training_utils.get_input_batch_params(inputs, - 32, - strategy) - - # The number of replicas now is equal to 2. - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) - # 32 is the batch size per replica. - steps = distributed_training_utils.get_input_batch_params(inputs, - 32, - strategy) - # The number of batches is the ratio of input samples(64) to - # batch size(32) which is 2. The number of steps(1) is the ratio of - # number of batches(2) to the number of replicas(2). + # Input samples of different sizes + input_20_samples = np.zeros((20, 3), dtype=np.float32) + input_63_samples = np.zeros((63, 3), dtype=np.float32) + input_64_samples = np.zeros((64, 3), dtype=np.float32) + + # Default global batch size 32 for input with 64 samples run in 2 steps + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=None) + self.assertEqual(batch_size, 32 // replica_scale_factor) + self.assertEqual(steps, 2) + + # Computed global batch size 20 is lower than 32 if we pass less samples. + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_20_samples, steps=None, batch_size=None) + self.assertEqual(batch_size, 20 // replica_scale_factor) self.assertEqual(steps, 1) - # 16 is the batch size per replica. - steps = distributed_training_utils.get_input_batch_params(inputs, - 16, - strategy) - # The number of batches is the ratio of input samples(64) to - # batch size(16) which is 4. The number of steps(2) is the ratio of - # number of batches(4) to the number of replicas(2). + # Default global batch size 32 cannot be used with 63 samples. + with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'): + distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=None, batch_size=None) + + @combinations.generate(strategy_combinations()) + def test_calculating_input_params_with_steps_no_batch_size(self, + distribution): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported(distribution): + replica_scale_factor = distribution.num_replicas_in_sync + + with self.cached_session(): + # Input samples of different sizes + input_63_samples = np.zeros((63, 3), dtype=np.float32) + input_64_samples = np.zeros((64, 3), dtype=np.float32) + + # Computed global batch size is correct for number of specified 1 step + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=1, batch_size=None) + self.assertEqual(batch_size, 64 // replica_scale_factor) + self.assertEqual(steps, 1) + + # Computed global batch size is correct for number of specified 2 steps + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=2, batch_size=None) + self.assertEqual(batch_size, 32 // replica_scale_factor) self.assertEqual(steps, 2) - def test_calculating_batch_size(self): + # All samples can not be consumed in specified number of steps + with self.assertRaisesRegexp(ValueError, 'not divisible by steps'): + distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=2, batch_size=None) + + # This cases is different for different strategies due to the + # difference in supported batch size being global or per-replica. + if replica_scale_factor == 1: + # Computed global batch size is correct even if not sharadable + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=3, batch_size=None) + self.assertEqual(batch_size, 21) + self.assertEqual(steps, 3) + else: + # Computed global batch size can not be sharded across replicas + with self.assertRaisesRegexp(ValueError, 'could not be sharded evenly ' + 'across the sync replicas'): + distributed_training_utils.get_input_params( + distribution, input_63_samples, steps=1, batch_size=None) + + @combinations.generate(strategy_combinations()) + def test_calculating_input_params_no_steps_with_batch_size(self, + distribution): + # Calculate the per_replica_batch_size scaling factor for strategies + # that use per_core_batch_size + replica_scale_factor = 1.0 + if not distributed_training_utils.global_batch_size_supported(distribution): + replica_scale_factor = distribution.num_replicas_in_sync + with self.cached_session(): - # 64 is the number of input samples. - inputs = np.zeros((64, 3), dtype=np.float32) - targets = np.zeros((64, 4), dtype=np.float32) + input_64_samples = np.zeros((64, 3), dtype=np.float32) + + # Computed steps is correct for specified batch size + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=16) + self.assertEqual(batch_size, 16) + self.assertEqual(steps, 4 // replica_scale_factor) + + # Computed steps is correct for specified batch size + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=32) + self.assertEqual(batch_size, 32) + self.assertEqual(steps, 2 // replica_scale_factor) + + # Number of samples is not divisible by the global batch size + with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'): + distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=20) + + # Number of samples is not divisible by the global batch size + with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'): + distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=None, batch_size=3) - model = get_model() - optimizer = gradient_descent.GradientDescentOptimizer(0.001) - loss = 'mse' - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) - strategy.extended._require_static_shapes = True - - model.compile(optimizer, loss, distribute=strategy) - iterator = model._distribution_standardize_user_data(inputs, - targets, - batch_size=None, - check_steps=True, - steps_name='steps', - steps=3) - - # The global batch size(21) across all replicas is the ratio of the input - # samples(64) to the steps(3). - # The batch size(10) per device is the ratio of the global batch size(21) - # to the number of replicas(2). - # The global batch size and batch size are rounded integer values. - self.assertEqual(10, distributed_training_utils.get_batch_dimension( - iterator._iterator)) + @combinations.generate(strategy_combinations()) + def test_calculating_input_params_with_steps_with_batch_size(self, + distribution): + with self.cached_session(): + input_64_samples = np.zeros((64, 3), dtype=np.float32) + + # No change to steps and batch size if both specified and feasible + steps, batch_size = distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=5, batch_size=3) + self.assertEqual(batch_size, 3) + self.assertEqual(steps, 5) + + # Number of samples is less than global batch size * steps + with self.assertRaisesRegexp(ValueError, 'less than samples required'): + distributed_training_utils.get_input_params( + distribution, input_64_samples, steps=10, batch_size=13) @combinations.generate(strategy_combinations()) def test_calling_model_with_numpy_arrays(self, distribution): @@ -611,9 +680,9 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, loss = 'mse' model.compile(optimizer, loss, distribute=distribution) - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - sample_weights = np.ones((10), np.float32) + inputs = np.zeros((20, 3), np.float32) + targets = np.zeros((20, 4), np.float32) + sample_weights = np.ones((20), np.float32) model.fit(inputs, targets, sample_weight=sample_weights, epochs=1, steps_per_epoch=2, verbose=1) @@ -636,7 +705,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, # `predict` a list that is equal in length to the number of model outputs. # In this test our model has two outputs and each element of `outs` # corresponds to all the samples of one of the model outputs. - self.assertEqual(2, len(outs)) + self.assertLen(outs, 2) # Each of the output samples have a dimension of 7. We should process all # the available input samples(6). self.assertAllEqual([6, 7], outs[0].shape) @@ -708,16 +777,20 @@ class TestDistributionStrategyWithDatasets(test.TestCase, # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work # as clone_model's input_tensors argument only seems to accept list and not # tuples or dict. - def test_fit_with_tuple_and_dict_dataset_inputs(self): + + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution): with self.cached_session(): model = multi_input_output_model() optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) input_a_np = np.random.random((10, 3)) input_b_np = np.random.random((10, 5)) @@ -790,35 +863,48 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model.evaluate(dataset, steps=2, verbose=1) model.predict(dataset, steps=2) - def test_dataset_input_shape_validation(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_dataset_wrong_input_shape(self, distribution): with self.cached_session(): model = get_model() optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - - model.compile(optimizer, loss, distribute=strategy) + model.compile(optimizer, loss, distribute=distribution) - # User forgets to batch the dataset - inputs = np.zeros((10, 3), dtype=np.float32) + # Wrong input shape + inputs = np.zeros((10, 5), dtype=np.float32) targets = np.zeros((10, 4), dtype=np.float32) dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat(100) + dataset = dataset.batch(10) - with self.assertRaisesRegexp(ValueError, 'expected input to have shape'): + with self.assertRaisesRegexp(ValueError, + 'expected input to have shape'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) - # Wrong input shape - inputs = np.zeros((10, 5), dtype=np.float32) + @combinations.generate(combinations.combine( + distribution=[combinations.mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_dataset_no_batch_input_validation(self, distribution): + with self.cached_session(): + model = get_model() + + optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) + loss = 'mse' + model.compile(optimizer, loss, distribute=distribution) + + # User forgets to batch the dataset + inputs = np.zeros((10, 3), dtype=np.float32) targets = np.zeros((10, 4), dtype=np.float32) dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat(100) - dataset = dataset.batch(10) - with self.assertRaisesRegexp(ValueError, - 'expected input to have shape'): + with self.assertRaisesRegexp(ValueError, 'expected input to have shape'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) @combinations.generate(combinations.combine( @@ -840,7 +926,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) - def test_learning_phase_value(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_learning_phase_value(self, distribution): # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare # meaningful values. Currently we don't pass the learning phase if the # Lambda layer uses the learning phase. @@ -854,15 +945,17 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.005) loss = 'mse' metrics = ['acc'] - strategy = mirrored_strategy.MirroredStrategy( - ['/device:GPU:0', '/device:GPU:1']) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + batch_size = 8 + if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy): + # CoreMirroredStrategy uses global batch size. + batch_size = 8 * distribution.num_replicas_in_sync inputs = np.ones((10, 1), dtype=np.float32) targets = np.ones((10, 1), dtype=np.float32) dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat().batch(8) + dataset = dataset.repeat().batch(batch_size) hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1) self.assertAlmostEqual(hist.history['acc'][0], 0, 0) @@ -873,24 +966,29 @@ class TestDistributionStrategyWithDatasets(test.TestCase, inputs = np.ones((10, 1), dtype=np.float32) predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs) - predict_dataset = predict_dataset.repeat().batch(5) + + predict_dataset = predict_dataset.repeat().batch(batch_size) output = model.predict(predict_dataset, steps=10) - # `predict` runs for 10 steps and in each step you process 10 samples. - ref_output = np.ones((100, 1), dtype=np.float32) + # `predict` runs for 10 steps + ref_output = np.ones((160, 1), dtype=np.float32) self.assertArrayNear(output, ref_output, 1e-1) class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): - def test_validating_dataset_input_tensors_with_shape_mismatch(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def test_validating_dataset_input_tensors_with_shape_mismatch(self, + distribution): with self.cached_session(): - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) a = constant_op.constant([1, 2], shape=(1, 2)) b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2)) x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b}) y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a}) - with strategy.scope(): + with distribution.scope(): # Removed device and input tensor shape details from the error message # since the order of the device and the corresponding input tensor shape # is not deterministic over different runs. @@ -899,17 +997,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): 'distributed tensor inputs ' 'DistributedValues:.+'): distributed_training_utils.validate_distributed_dataset_inputs( - strategy, x, y) + distribution, x, y) - def test_validating_dataset_input_tensors_with_dtype_mismatch(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=['graph'])) + def test_validating_dataset_input_tensors_with_dtype_mismatch(self, + distribution): with self.cached_session(): - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0', - '/device:CPU:0']) a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32) b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64) x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b}) y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a}) - with strategy.scope(): + with distribution.scope(): # Removed device and input tensor dtype details from the error message # since the order of the device and the corresponding input tensor dtype # is not deterministic over different runs. @@ -918,21 +1020,23 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): 'distributed tensor inputs ' 'DistributedValues:.+'): distributed_training_utils.validate_distributed_dataset_inputs( - strategy, x, y) + distribution, x, y) - def test_unsupported_features(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_unsupported_features(self, distribution): with self.cached_session(): model = get_model() optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) - dataset = get_dataset(strategy) + dataset = get_dataset(distribution) # Test with validation split with self.assertRaisesRegexp( @@ -967,18 +1071,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): 'you should specify the `steps` argument'): model.predict(dataset, verbose=0) - def test_calling_with_unsupported_predefined_callbacks(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_calling_with_unsupported_predefined_callbacks(self, distribution): with self.cached_session(): model = get_model() optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - model.compile(optimizer, loss, metrics=metrics, distribute=strategy) + model.compile(optimizer, loss, metrics=metrics, distribute=distribution) - dataset = get_dataset(strategy) + dataset = get_dataset(distribution) def schedule(_): return 0.001 @@ -1001,11 +1108,17 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)]) -class TestDistributionStrategyWithLossMasking(test.TestCase): +class TestDistributionStrategyWithLossMasking(test.TestCase, + parameterized.TestCase): # TODO(priyag): Enable all strategies for this test. Currently it does not # work for TPU due to some invalid datatype. - def test_masking(self): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_two_gpus], + mode=['graph'])) + def test_masking(self, distribution): with self.cached_session(): np.random.seed(1337) x = np.array([[[1], [1]], [[0], [0]]]) @@ -1014,12 +1127,9 @@ class TestDistributionStrategyWithLossMasking(test.TestCase): model.add( keras.layers.TimeDistributed( keras.layers.Dense(1, kernel_initializer='one'))) - strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1', - '/device:GPU:0']) - model.compile(loss='mse', optimizer=gradient_descent.GradientDescentOptimizer(0.01), - distribute=strategy) + distribute=distribution) y = np.array([[[1], [1]], [[1], [1]]]) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) @@ -1086,7 +1196,9 @@ class TestDistributionStrategyCorrectness(test.TestCase, distribute=distribution) batch_size = 64 - batch_size //= distribution.num_replicas_in_sync + if not distributed_training_utils.global_batch_size_supported( + distribution): + batch_size //= distribution.num_replicas_in_sync train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = batch_wrapper(train_dataset, batch_size, distribution) @@ -1098,7 +1210,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, with self.cached_session(): tolerance = 1e-5 - if isinstance(distribution, mirrored_strategy.MirroredStrategy): + if isinstance(distribution, (mirrored_strategy.MirroredStrategy, + mirrored_strategy.CoreMirroredStrategy)): # TODO(b/119257215): use the default one once the flakyness is fixed. tolerance = 1e-4 @@ -1163,8 +1276,5 @@ class TestDistributionStrategyCorrectness(test.TestCase, predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance) -# TODO(priyag): Add a test for TPUStrategy with steps_per_run > 1. - - if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py index a2fc118173a18e26fd6f8425a102a940fb8b92c5..8ac659abe96370b751ed1556cc699fe20788a0fd 100644 --- a/tensorflow/contrib/distribute/python/metrics_v1_test.py +++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py @@ -77,7 +77,9 @@ def all_combinations(): distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], mode=["graph"]) @@ -98,7 +100,7 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase): if isinstance(distribution, tpu_strategy.TPUStrategy): def step_fn(ctx, inputs): value, update = distribution.call_for_each_replica( - metric_fn, args=[inputs]) + metric_fn, args=inputs) ctx.set_non_tensor_output(name="value", output=value) return distribution.group(update) diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index 1f57dd1754c18be9108981d545d2ef3cacaa0ab1..129b394bb6902d1f7b9756c67ac5a9f9ed655980 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -64,7 +64,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): model_fn, dataset_fn, layer = minimize_loss_example( optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): del ctx # Unused return distribution.group( distribution.call_for_each_replica(model_fn, args=inputs)) @@ -158,7 +158,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): use_callable_loss=True, create_optimizer_inside_model_fn=True) - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): del ctx # Unused return distribution.group( distribution.call_for_each_replica(model_fn, args=inputs)) @@ -227,7 +227,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): renorm=renorm, update_ops_in_replica_mode=not update_ops_in_cross_replica_mode) - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): del ctx # Unused fetches = distribution.unwrap( distribution.call_for_each_replica(model_fn, args=inputs)) @@ -286,7 +286,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): distribution=[ combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus ]), combinations.combine( mode=["graph"], use_callable_loss=[True, False]) + @@ -322,10 +324,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): labels = dataset_ops.Dataset.from_tensors([[6.], [21.]]) return dataset_ops.Dataset.zip((features, labels)).repeat() - def step_fn(ctx, x, y): + def step_fn(ctx, inputs): del ctx # Unused return distribution.group( - distribution.call_for_each_replica(model_fn, args=(x, y))) + distribution.call_for_each_replica(model_fn, args=inputs)) iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn)) @@ -342,7 +344,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): run_step() v = all_vars[0] - self.assertTrue(all([v is vi for vi in all_vars[1:]])) + self.assertTrue(all(v is vi for vi in all_vars[1:])) weight = numpy.squeeze(self.evaluate(v)) # Our model is: # predict = x * w @@ -409,7 +411,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): output_context.set_non_tensor_output(key1, value1) return (train_op, loss) - def step_fn(output_context, *inputs): + def step_fn(output_context, inputs): (train_op, loss) = distribution.call_for_each_replica( model_fn, args=(output_context,) + inputs) output_context.set_last_step_output( diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py index 960cd09696f70a60b8d8b4b14513fe4562143e28..a3bcc8db88f9466811aa15d37e14a22eb5ce485e 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py @@ -12,293 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Class MirroredStrategy implementing DistributionStrategy.""" +"""Contrib version of MirroredStrategy.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib -from functools import partial -import threading +import functools -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import shared_variable_creator -from tensorflow.contrib.distribute.python import values -from tensorflow.python import pywrap_tensorflow -from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.distribute import reduce_util -from tensorflow.python.eager import context -from tensorflow.python.eager import tape -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import device as tf_device -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import variable_scope -from tensorflow.python.training import coordinator -from tensorflow.python.training import device_util +from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.distribute import values from tensorflow.python.training import distribute as distribute_lib -from tensorflow.python.util import nest -# TODO(josh11b): Replace asserts in this file with if ...: raise ... - - -@contextlib.contextmanager -def _enter_graph(g): - if context.executing_eagerly(): - with g.as_default(), context.eager_mode(): - yield - else: - with g.as_default(): - yield - - -def _cpu_device(device): - cpu_device = tf_device.DeviceSpec.from_string(device) - cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0)) - return cpu_device.to_string() - - -class _RequestedStop(Exception): - pass - - -# _call_for_each_replica and _reduce_non_distributed_value are not members of -# MirroredStrategy so that they are generally not allowed to use anything -# specific to MirroredStrategy and thus can be shared with other distribution -# strategies. - - -# TODO(yuefengz): maybe create a common class for those who need to call this -# _call_for_each_replica. -def _call_for_each_replica(distribution, fn, args, kwargs): - """Run `fn` in separate threads, once per replica/worker device. - - Args: - distribution: the DistributionStrategy object. - fn: function to run (will be run once per device, each in its own thread). - args: positional arguments for `fn` - kwargs: keyword arguments for `fn`. - - Returns: - Merged return value of `fn` across all replicas. - - Raises: - RuntimeError: If fn() calls get_replica_context().merge_call() a different - number of times from the available devices. - """ - # TODO(josh11b): Add this option once we add synchronization to variable - # creation. Until then, this is pretty unsafe to use. - run_concurrently = False - if not context.executing_eagerly(): - # Needed for per-thread device, etc. contexts in graph mode. - ops.get_default_graph().switch_to_thread_local() - - coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,)) - - shared_variable_store = {} - - # TODO(isaprykin): Create these threads once instead of during every run() - # call. - threads = [] - for index, d in enumerate(distribution.extended.worker_devices): - variable_creator_fn = shared_variable_creator.make_fn( - shared_variable_store, index) - t = MirroredExtended._MirroredReplicaThread( # pylint: disable=protected-access - distribution, coord, d, variable_creator_fn, fn, - *values.select_device(d, args), **values.select_device(d, kwargs)) - threads.append(t) - - for t in threads: - t.start() - - # When `fn` starts `should_run` event is set on _MirroredReplicaThread - # (`MRT`) threads. The execution waits until - # `MRT.has_paused` is set, which indicates that either `fn` is - # complete or a `get_replica_context().merge_call()` is called. If `fn` is - # complete, then `MRT.done` is set to True. Otherwise, arguments - # of `get_replica_context().merge_call` from all paused threads are grouped - # and the `merge_fn` is performed. Results of the - # `get_replica_context().merge_call` are then set to `MRT.merge_result`. - # Each such `get_replica_context().merge_call` call returns the - # `MRT.merge_result` for that thread when `MRT.should_run` event - # is reset again. Execution of `fn` resumes. - - try: - with coord.stop_on_exception(): - all_done = False - while not all_done and not coord.should_stop(): - done = [] - if run_concurrently: - for t in threads: - t.should_run.set() - for t in threads: - t.has_paused.wait() - t.has_paused.clear() - if coord.should_stop(): - return None - done.append(t.done) - else: - for t in threads: - t.should_run.set() - t.has_paused.wait() - t.has_paused.clear() - if coord.should_stop(): - return None - done.append(t.done) - if coord.should_stop(): - return None - all_done = all(done) - if not all_done: - if any(done): - raise RuntimeError("Some replicas made a different number of " - "replica_context().merge_call() calls.") - # get_replica_context().merge_call() case - merge_args = values.regroup({t.device: t.merge_args for t in threads}) - merge_kwargs = values.regroup( - {t.device: t.merge_kwargs for t in threads}) - # We capture the name_scope of the MRT when we call merge_fn - # to ensure that if we have opened a name scope in the MRT, - # it will be respected when executing the merge function. We only - # capture the name_scope from the first MRT and assume it is - # the same for all other MRTs. - mtt_captured_name_scope = threads[0].captured_name_scope - with ops.name_scope(mtt_captured_name_scope): - merge_result = threads[0].merge_fn(distribution, *merge_args, - **merge_kwargs) - for t in threads: - t.merge_result = values.select_device(t.device, merge_result) - finally: - for t in threads: - t.should_run.set() - coord.join(threads) - - return values.regroup({t.device: t.main_result for t in threads}) - - -def _reduce_non_distributed_value(extended, reduce_op, value, destinations): - """Reduce a non-DistributedValue `value` to `destinations`.""" - if isinstance(value, values.DistributedValues): - raise ValueError("You are passing a `DistributedValue` to " - "`_reduce_non_distributed_value`, which is not allowed.") - - # If the same value is present on all replicas then the PerReplica value will - # be a single value. We also handle the case when `value` is a single value - # and equal to 0. - if value == 0: - return 0 - # If the reduce op is MEAN or ONLY_FIRST_REPLICA, then this - # essentially means that the same value should be on all destinations. - if reduce_op in (reduce_util.ReduceOp.MEAN, - reduce_util.ReduceOp.ONLY_FIRST_REPLICA): - return value - - cross_tower_ops_lib.validate_destinations(destinations) - # We do not support a reduce op of SUM if the value is the same across - # all replicas. We call this as part of assign functions for MirroredVariables - # and summing up identical values across replicas is not clearly defined. - if (len(extended.worker_devices) != 1 or - not cross_tower_ops_lib.check_destinations(destinations)): - raise ValueError("A non-DistributedValues value %s cannot be reduced with " - "the given reduce op %s." % (value, reduce_op)) - # TODO(anjalisridhar): Moves these methods to a device utility file? - devices = cross_tower_ops_lib.get_devices_from(destinations) - if len(devices) == 1: - with ops.device(devices[0]): - return array_ops.identity(value) - else: - value_updates = {} - for d in devices: - with ops.device(d): - value_updates[d] = array_ops.identity(value) - return values.Mirrored(value_updates) - - -def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs): # pylint: disable=g-missing-docstring - # Figure out what collections this variable should be added to. - # We'll add the MirroredVariable to those collections instead. - collections = kwargs.pop("collections", None) - if collections is None: - collections = [ops.GraphKeys.GLOBAL_VARIABLES] - kwargs["collections"] = [] - - # Get synchronization value - synchronization = kwargs.get("synchronization", - variable_scope.VariableSynchronization.ON_WRITE) - if synchronization == variable_scope.VariableSynchronization.NONE: - raise ValueError("`NONE` variable synchronization mode is not " - "supported with `Mirrored` distribution strategy. Please" - " change the `synchronization` for variable: " + - kwargs["name"]) - elif synchronization == variable_scope.VariableSynchronization.ON_READ: - # Variables that are to be synced on read are replica local. - is_replica_local = True - kwargs["trainable"] = False - elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or - synchronization == variable_scope.VariableSynchronization.AUTO): - # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`. - is_replica_local = False - else: - raise ValueError("Invalid variable synchronization mode: " + - synchronization + " for variable: " + kwargs["name"]) - - # Get aggregation value - aggregation = kwargs.pop("aggregation", - variable_scope.VariableAggregation.NONE) - if aggregation not in ( - variable_scope.VariableAggregation.NONE, - variable_scope.VariableAggregation.SUM, - variable_scope.VariableAggregation.MEAN, - variable_scope.VariableAggregation.ONLY_FIRST_REPLICA - ): - raise ValueError("Invalid variable aggregation mode: " + aggregation + - " for variable: " + kwargs["name"]) - - # Ignore user-specified caching device, not needed for mirrored variables. - kwargs.pop("caching_device", None) - - # TODO(josh11b,apassos): It would be better if variable initialization - # was never recorded on the tape instead of having to do this manually - # here. - with tape.stop_recording(): - index = real_mirrored_creator(devices, *args, **kwargs) - - if is_replica_local: - result = values.ReplicaLocalVariable( - index, index[devices[0]], aggregation) - else: - result = values.MirroredVariable(index, index[devices[0]], aggregation) - - # Add the wrapped variable to the requested collections. - # The handling of eager mode and the global step matches - # ResourceVariable._init_from_args(). - if not context.executing_eagerly(): - g = ops.get_default_graph() - # If "trainable" is True, next_creator() will add the member variables - # to the TRAINABLE_VARIABLES collection, so we manually remove - # them and replace with the MirroredVariable. We can't set - # "trainable" to False for next_creator() since that causes functions - # like implicit_gradients to skip those variables. - if kwargs.get("trainable", True): - collections.append(ops.GraphKeys.TRAINABLE_VARIABLES) - l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES) - for v in index.values(): - if v in l: - l.remove(v) - g.add_to_collections(collections, result) - elif ops.GraphKeys.GLOBAL_STEP in collections: - ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result) - - return result +# pylint: disable=protected-access,invalid-name +_call_for_each_replica = mirrored_strategy._call_for_each_replica +_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value +_create_mirrored_variable = mirrored_strategy._create_mirrored_variable +CoreMirroredStrategy = mirrored_strategy.MirroredStrategy +CoreMirroredExtended = mirrored_strategy.MirroredExtended +# pylint: enable=protected-access,invalid-name class MirroredStrategy(distribute_lib.DistributionStrategy): """Mirrors vars to distribute across multiple devices and machines. + *** contrib version *** + This strategy uses one replica per device and sync replication for its multi-GPU version. @@ -354,494 +94,59 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): auto_shard_dataset=False, cross_tower_ops=None): assert not (cross_device_ops and cross_tower_ops) - extended = MirroredExtended( - self, devices, num_gpus, num_gpus_per_worker, - cross_device_ops or cross_tower_ops, auto_shard_dataset) + if num_gpus is not None and num_gpus_per_worker is not None: + raise ValueError( + "You cannot specify both `num_gpus` and `num_gpus_per_worker`.") + if num_gpus is None: + num_gpus = num_gpus_per_worker + extended = MirroredExtended(self, devices, num_gpus, + cross_device_ops or cross_tower_ops, + auto_shard_dataset) super(MirroredStrategy, self).__init__(extended) -class MirroredExtended(distribute_lib.DistributionStrategyExtended): - """Implementation of MirroredStrategy.""" +class MirroredExtended(CoreMirroredExtended): + """Implementation of (contrib) MirroredStrategy.""" def __init__(self, container_strategy, devices=None, - num_gpus=None, num_gpus_per_worker=None, cross_device_ops=None, auto_shard_dataset=False): - super(MirroredExtended, self).__init__(container_strategy) - # TODO(josh11b): Rename self._cross_tower_ops -> self._cross_device_ops - self._cross_tower_ops = cross_device_ops + super(MirroredExtended, self).__init__( + container_strategy, devices, num_gpus_per_worker, cross_device_ops) self._auto_shard_dataset = auto_shard_dataset - # Remember num GPUs which might be needed by `configure` method. - if num_gpus is not None and num_gpus_per_worker is not None: - raise ValueError( - "You cannot specify both `num_gpus` and `num_gpus_per_worker`.") - if num_gpus is not None: - self._num_gpus = num_gpus - else: - self._num_gpus = num_gpus_per_worker - - self._initialize_local(self._num_gpus, devices) - - def _initialize_local(self, num_gpus, devices): - """Initializes the object for local training.""" - self._cluster_spec = None - # Convert `num_gpus` into `devices`, shouldn't specify both. - if devices is None: - if num_gpus is None: - num_gpus = context.num_gpus() - if num_gpus == 0: - devices = ["/device:CPU:0"] - else: - devices = ["/device:GPU:%d" % d for d in range(num_gpus)] - elif num_gpus is not None: - raise ValueError("Must only specify one of `devices` and `num_gpus`.") - self._num_gpus = num_gpus - # TODO(yuefengz): consider setting the default device. - - assert devices, "Must specify at least one device." - assert len(set(devices)) == len(devices), ( - "No duplicates allowed in `devices` argument.") - # TODO(josh11b): Require at least 2 devices? - self._devices = [device_util.resolve(d) for d in devices] - self._canonical_device_set = set(self._devices) - self._device_index = values.PerReplica( - {d: i for i, d in enumerate(devices)}) - def _initialize_multi_worker(self, num_gpus, cluster_spec): - """Initializes the object for multi-worker training.""" - cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec) - self._cluster_spec = cluster_spec + def _make_dataset_iterator(self, dataset): + """Make iterator from dataset without splitting the batch. - self._workers = [] - for job in ["chief", "worker"]: - for task in range(len(cluster_spec.as_dict().get(job, []))): - self._workers.append("/job:%s/task:%d" % (job, task)) + This implementation is different than the one in + `tf.distribute.MirroredStrategy` for purposes of backward compatibility. + We treat the incoming dataset's batch size as per replica batch size. - if num_gpus is None: - raise ValueError("`num_gpus` is required if `cluster_spec` is given.") - if num_gpus > 0: - self._worker_devices = [ - (worker, [ - device_util.canonicalize(worker + "/device:GPU:%d" % gpu) - for gpu in range(num_gpus) - ]) for worker in self._workers - ] + Args: + dataset: `tf.data.Dataset` for input. + Returns: + An `InputIterator` which returns inputs for each step of the computation. + """ + if self._cluster_spec: + worker_device_pairs = self._worker_devices else: - self._worker_devices = [ - (worker, [device_util.canonicalize(worker, "/device:CPU:0")]) - for worker in self._workers - ] - - devices = nest.flatten([l for _, l in self._worker_devices]) - - # Setting `_default_device` will add a device scope in the - # distribution.scope. We set the default device to the first worker. When - # users specify device under distribution.scope by - # with tf.device("/cpu:0"): - # ... - # their ops will end up on the cpu device of its first worker, e.g. - # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode. - self._default_device = self._workers[0] - - assert devices, "Must specify at least one device." - assert len(set(devices)) == len(devices), ( - "No duplicates allowed in `devices` argument.") - # TODO(josh11b): Require at least 2 devices? - self._devices = [device_util.resolve(d) for d in devices] - self._canonical_device_set = set(self._devices) - self._device_index = values.PerReplica( - {d: i for i, d in enumerate(devices)}) - - def _create_variable(self, next_creator, *args, **kwargs): - """Create a mirrored variable. See `DistributionStrategy.scope`.""" - colocate_with = kwargs.pop("colocate_with", None) - devices = self._get_devices_from(colocate_with) - - def _real_mirrored_creator(devices, *args, **kwargs): # pylint: disable=g-missing-docstring - index = {} - for i, d in enumerate(devices): - with ops.device(d): - if i > 0: - # Give replicas meaningful distinct names: - var0name = index[devices[0]].name.split(":")[0] - # We append a / to variable names created on replicas with id > 0 to - # ensure that we ignore the name scope and instead use the given - # name as the absolute name of the variable. - kwargs["name"] = "%s/replica_%d/" % (var0name, i) - # Initialize replicas with the same value: - def initial_value_fn(device=d): - if context.executing_eagerly(): - init_value = index[devices[0]].value() - return array_ops.identity(init_value) - else: - with ops.device(device): - init_value = index[devices[0]].initial_value - return array_ops.identity(init_value) - kwargs["initial_value"] = initial_value_fn - with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): - # Don't record operations (e.g. other variable reads) during - # variable creation. - with tape.stop_recording(): - v = next_creator(*args, **kwargs) - assert not isinstance(v, values.DistributedVariable) - index[d] = v - return index - - return _create_mirrored_variable(devices, _real_mirrored_creator, *args, - **kwargs) + worker_device_pairs = [("/job:localhost", self._devices)] + return values.DatasetIterator(dataset, worker_device_pairs) def _distribute_dataset(self, dataset_fn): if self._cluster_spec: return values.MultiWorkerDataset( - partial(self._call_dataset_fn, dataset_fn), self._worker_devices, + functools.partial(self._call_dataset_fn, dataset_fn), + self._worker_devices, auto_shard=self._auto_shard_dataset) else: return values.PerReplicaDataset( self._call_dataset_fn(dataset_fn), self._devices) - def _make_input_fn_iterator( - self, - input_fn, - replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): - if self._cluster_spec: - input_fns = [] - for i in range(len(self._worker_devices)): - input_context = distribute_lib.InputContext( - num_input_pipelines=len(self._worker_devices), - input_pipeline_id=i, - num_replicas_in_sync=self._num_replicas_in_sync) - input_fns.append( - partial(self._call_dataset_fn, input_fn, input_context)) - - return values.MultiWorkerDataset(input_fns, self._worker_devices, - self._auto_shard_dataset) - else: - input_context = distribute_lib.InputContext( - num_input_pipelines=1, - input_pipeline_id=0, - num_replicas_in_sync=self._num_replicas_in_sync) - return values.PerReplicaDataset( - self._call_dataset_fn(input_fn, input_context), self._devices) - - # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed. - def _experimental_run_steps_on_iterator(self, fn, iterator, iterations, - initial_loop_values=None): - if initial_loop_values is None: - initial_loop_values = {} - initial_loop_values = nest.flatten(initial_loop_values) - - ctx = values.MultiStepContext() - def body(i, *args): - """A wrapper around `fn` to create the while loop body.""" - del args - fn_inputs = iterator.get_next() - if not isinstance(fn_inputs, tuple): - fn_inputs = (fn_inputs,) - fn_result = fn(ctx, *fn_inputs) - for (name, output) in ctx.last_step_outputs.items(): - # Convert all outputs to tensors, potentially from `DistributedValues`. - ctx.last_step_outputs[name] = self._unwrap(output) - flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) - with ops.control_dependencies([fn_result]): - return [i + 1] + flat_last_step_outputs - - # We capture the control_flow_context at this point, before we run `fn` - # inside a while_loop. This is useful in cases where we might need to exit - # these contexts and get back to the outer context to do some things, for - # e.g. create an op which should be evaluated only once at the end of the - # loop on the host. One such usage is in creating metrics' value op. - self._outer_control_flow_context = ( - ops.get_default_graph()._get_control_flow_context()) # pylint: disable=protected-access - - cond = lambda i, *args: i < iterations - i = constant_op.constant(0) - loop_result = control_flow_ops.while_loop( - cond, body, [i] + initial_loop_values, name="", - parallel_iterations=1, back_prop=False, swap_memory=False, - return_same_structure=True) - del self._outer_control_flow_context - - ctx.run_op = control_flow_ops.group(loop_result) - - # Convert the last_step_outputs from a list to the original dict structure - # of last_step_outputs. - last_step_tensor_outputs = loop_result[1:] - last_step_tensor_outputs_dict = nest.pack_sequence_as( - ctx.last_step_outputs, last_step_tensor_outputs) - - for name, reduce_op in ctx._last_step_outputs_reduce_ops.items(): # pylint: disable=protected-access - output = last_step_tensor_outputs_dict[name] - # For outputs that have already been reduced, wrap them in a Mirrored - # container, else in a PerReplica container. - if reduce_op is None: - last_step_tensor_outputs_dict[name] = values.regroup( - {d: t for d, t in zip(self._devices, output)}, values.PerReplica) - else: - assert len(output) == 1 - last_step_tensor_outputs_dict[name] = output[0] - - ctx._set_last_step_outputs(last_step_tensor_outputs_dict) # pylint: disable=protected-access - return ctx - - def _broadcast_to(self, tensor, destinations): - # TODO(josh11b): In eager mode, use one thread per device, or async mode. - return self._get_cross_tower_ops().broadcast(tensor, destinations or - self._devices) - - def _call_for_each_replica(self, fn, args, kwargs): - return _call_for_each_replica(self._container_strategy(), fn, args, kwargs) - - def _configure(self, - session_config=None, - cluster_spec=None, - task_type=None, - task_id=None): - del task_type, task_id - - if session_config: - session_config.isolate_session_state = True - - if cluster_spec: - self._initialize_multi_worker(self._num_gpus, cluster_spec) - - if self._cross_tower_ops is None: - if self._cluster_spec: - # It currently cannot detect the toplogy of remote workers. So we - # hard-code the multi-worker all-reduce algorithm for now. - if len(self._workers) == 1: - # The default is "nccl". - self._cross_tower_ops = cross_tower_ops_lib.AllReduceCrossDeviceOps() - else: - # The default is hierarchical reduce and broadcast. - self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce( - self._workers, self._num_gpus) - else: - self._cross_tower_ops = cross_tower_ops_lib.choose_the_best( - self._devices, session_config=session_config) - - def _get_cross_tower_ops(self): - if self._cross_tower_ops is None: - self._cross_tower_ops = ( - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()) - return self._cross_tower_ops - - def _reduce_to(self, reduce_op, value, destinations): - assert not isinstance(value, values.Mirrored) - if not isinstance(value, values.DistributedValues): - # This function handles reducing values that are not PerReplica or - # Mirrored values. For example, the same value could be present on all - # replicas in which case `value` would be a single value or value could - # be 0. - return _reduce_non_distributed_value(self, reduce_op, value, - destinations) - if reduce_op == reduce_util.ReduceOp.ONLY_FIRST_REPLICA: - value = value.get(self._devices[0]) - if isinstance(value, (int, float)): - return value - return self.broadcast_to(value, destinations) - return self._get_cross_tower_ops().reduce( - reduce_op, value, destinations=destinations) - - def _batch_reduce_to(self, reduce_op, value_destination_pairs): - if reduce_op == reduce_util.ReduceOp.ONLY_FIRST_REPLICA: - return [self.broadcast_to(v.get(self._devices[0]), d) - for v, d in value_destination_pairs] - return self._get_cross_tower_ops().batch_reduce(reduce_op, - value_destination_pairs) - - def _update(self, var, fn, args, kwargs, group): - # TODO(josh11b): In eager mode, use one thread per device. - assert isinstance(var, values.DistributedVariable) - updates = {} - for d, v in var._index.items(): # pylint: disable=protected-access - name = "update_%d" % self._device_index.get(d) - with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name): - # If args and kwargs are not mirrored, the value is returned as is. - updates[d] = fn(v, - *values.select_device_mirrored(d, args), - **values.select_device_mirrored(d, kwargs)) - return values.update_regroup(self, updates, group) - - def _update_non_slot(self, colocate_with, fn, args, kwargs, group): - assert isinstance(colocate_with, list) - # TODO(josh11b): In eager mode, use one thread per device. - updates = {} - for d in colocate_with: - name = "update_%d" % self._device_index.get(d) - with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name): - updates[d] = fn(*values.select_device_mirrored(d, args), - **values.select_device_mirrored(d, kwargs)) - return values.update_regroup(self, updates, group) - - def read_var(self, replica_local_var): - """Read the aggregate value of a replica-local variable.""" - if isinstance(replica_local_var, values.ReplicaLocalVariable): - return replica_local_var._get_cross_replica() # pylint: disable=protected-access - assert isinstance(replica_local_var, values.Mirrored) - return array_ops.identity(replica_local_var.get()) - - def _unwrap(self, val): - if isinstance(val, values.DistributedValues): - # Return in a deterministic order. - if set(val.devices) == self._canonical_device_set: - return [val.get(device=d) for d in self._devices] - return [val.get(device=d) for d in sorted(val.devices)] - return [val] - - def value_container(self, val): - return values.value_container(val) - - @property - def _num_replicas_in_sync(self): - return len(self._devices) - - @property - def worker_devices(self): - # Make a copy to prevent users from accidentally mutating our copy. - return list(self._devices) - - @property - def parameter_devices(self): - return list(self._devices) - + # TODO(priyag): Delete this once all strategies use global batch size. @property - def experimental_between_graph(self): + def _global_batch_size(self): return False - - @property - def experimental_should_init(self): - return True - - @property - def should_checkpoint(self): - return True - - @property - def should_save_summary(self): - return True - - def non_slot_devices(self, var_list): - del var_list - return list(self._devices) - - def _get_devices_from(self, colocate_with=None): - if colocate_with is None: - return self._devices - else: - return cross_tower_ops_lib.get_devices_from(colocate_with) - - class _MirroredReplicaThread(threading.Thread): - """A thread that runs() a function on a device.""" - - def __init__(self, dist, coord, device, variable_creator_fn, fn, *args, - **kwargs): - super(MirroredExtended._MirroredReplicaThread, self).__init__() # pylint: disable=protected-access - self.coord = coord - self.distribution = dist - self.device = device - self.replica_id = dist.worker_devices.index(device) - self.variable_creator_fn = variable_creator_fn - # State needed to run and return the results of `fn`. - self.main_fn = fn - self.main_args = args - self.main_kwargs = kwargs - self.main_result = None - self.done = False - # State needed to run the next merge_call() (if any) requested via - # ReplicaContext. - self.merge_fn = None - self.merge_args = None - self.merge_kwargs = None - self.merge_result = None - self.captured_name_scope = None - # We use a thread.Event for the main thread to signal when this - # thread should start running (`should_run`), and another for - # this thread to transfer control back to the main thread - # (`has_paused`, either when it gets to a - # `get_replica_context().merge_call` or when `fn` returns). In - # either case the event starts cleared, is signaled by calling - # set(). The receiving thread waits for the signal by calling - # wait() and then immediately clearing the event using clear(). - self.should_run = threading.Event() - self.has_paused = threading.Event() - # These fields have to do with inheriting various contexts from the - # parent thread: - # pylint: disable=protected-access - self.context_mode = context.context()._eager_context.mode - if not context.context()._context_handle: - context.context()._initialize_handle_and_devices() - self.context_device_policy = ( - pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy( - context.context()._context_handle)) - self.graph = ops.get_default_graph() - self._variable_creator_stack = self.graph._variable_creator_stack[:] - self._captured_var_scope = variable_scope.get_variable_scope() - # Adding a "/" at end lets us re-enter this scope later. - self._name_scope = self.graph.get_name_scope() - if self._name_scope: - self._name_scope += "/" - if self.replica_id > 0: - if not self._name_scope: - self._name_scope = "" - self._name_scope += "replica_%d/" % self.replica_id - - def run(self): - # pylint: disable=protected-access - self.graph._variable_creator_stack = self._variable_creator_stack - self.should_run.wait() - self.should_run.clear() - try: - if self.coord.should_stop(): - return - with self.coord.stop_on_exception(), \ - context.context()._mode(self.context_mode), \ - context.context().device_policy(self.context_device_policy), \ - _enter_graph(self.graph), \ - MirroredReplicaContext(self.distribution, constant_op.constant( - self.replica_id, dtypes.int32)), \ - ops.device(self.device), \ - ops.name_scope(self._name_scope), \ - variable_scope.variable_scope( - self._captured_var_scope, reuse=self.replica_id > 0), \ - variable_scope.variable_creator_scope(self.variable_creator_fn): - self.main_result = self.main_fn(*self.main_args, **self.main_kwargs) - self.done = True - finally: - self.has_paused.set() - - -class MirroredReplicaContext(distribute_lib.ReplicaContext): - """ReplicaContext used in MirroredStrategy.call_for_each_replica(). - - Opened in `_MirroredReplicaThread`, to allow the user to invoke - `MirroredStrategy`'s specific implementation of `merge_call()`, - which works by delegating the function and its arguments to - the main thread (the one that invoked - `MirroredStrategy.call_for_each_replica()`). - """ - - def _merge_call(self, fn, args, kwargs): - """Delegate to the main thread to actually perform merge_call().""" - t = threading.current_thread() # a _MirroredReplicaThread - t.merge_fn = fn - t.merge_args = args - t.merge_kwargs = kwargs - t.captured_name_scope = t.graph.get_name_scope() - # Adding a "/" at end lets us re-enter this scope later. - if t.captured_name_scope: - t.captured_name_scope += "/" - t.has_paused.set() - t.should_run.wait() - t.should_run.clear() - if t.coord.should_stop(): - raise _RequestedStop() - return t.merge_result - - @property - def devices(self): - distribute_lib.require_replica_context(self) - replica_id = tensor_util.constant_value(self._replica_id_in_sync_group) - return [self._distribution_strategy.worker_devices[replica_id]] diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index 78d6876723cd56261b2da55ae9f04dd621c965df..1027da857d3042e5f3699bf9e373c4be4d3a754a 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -20,15 +20,16 @@ from __future__ import print_function import sys +from absl.testing import parameterized import numpy as np +from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import mirrored_strategy from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import strategy_test_lib -from tensorflow.contrib.distribute.python import values -from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import function @@ -36,7 +37,6 @@ from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.keras.engine import training as keras_training from tensorflow.python.keras.layers import core as keras_core from tensorflow.python.layers import core @@ -57,165 +57,193 @@ from tensorflow.python.training import server_lib GPU_TEST = "test_gpu" in sys.argv[0] -class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase): +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], + mode=["graph", "eager"])) +class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase, + parameterized.TestCase): - def _get_distribution_strategy(self): - devices = ["/device:CPU:0", "/device:GPU:0"] - if GPU_TEST: - self.assertGreater(context.num_gpus(), 0) - if context.num_gpus() > 1: - devices = ["/device:GPU:0", "/device:GPU:1"] - print(self.id().split(".")[-1], "devices:", ", ".join(devices)) - return mirrored_strategy.MirroredStrategy(devices) + def testMinimizeLoss(self, distribution): + if context.executing_eagerly(): + self._test_minimize_loss_eager(distribution) + else: + self._test_minimize_loss_graph(distribution) - def testMinimizeLossEager(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_minimize_loss_eager(self._get_distribution_strategy()) + def testReplicaId(self, distribution): + self._test_replica_id(distribution) - def testMinimizeLossGraph(self): - soft_placement = not GPU_TEST - print("testMinimizeLossGraph soft_placement:", soft_placement) - self._test_minimize_loss_graph( - self._get_distribution_strategy(), soft_placement=soft_placement) - - def testReplicaId(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_replica_id(self._get_distribution_strategy()) - - def testNumReplicasInSync(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self.assertEqual(2, self._get_distribution_strategy(). - num_replicas_in_sync) - - @test_util.run_in_graph_and_eager_modes - def testCallAndMergeExceptions(self): - if not GPU_TEST: - self.skipTest("Not GPU test") - self._test_call_and_merge_exceptions(self._get_distribution_strategy()) - - @test_util.run_in_graph_and_eager_modes - def testRunRegroupError(self): + def testNumReplicasInSync(self, distribution): + self.assertEqual(2, distribution.num_replicas_in_sync) + def testCallAndMergeExceptions(self, distribution): + self._test_call_and_merge_exceptions(distribution) + + def testRunRegroupError(self, distribution): def run_fn(): replica_id = int(self.evaluate(_replica_id())) # Generates a list with different lengths on different devices. # Will fail in _regroup() (if more than one device). return list(range(replica_id)) - dist = self._get_distribution_strategy() - with dist.scope(), self.assertRaises(AssertionError): - dist.call_for_each_replica(run_fn) - - @test_util.run_in_graph_and_eager_modes - def testReduceToCpu(self): - if not GPU_TEST: - self.skipTest("Not GPU test") + with distribution.scope(), self.assertRaises(AssertionError): + distribution.extended.call_for_each_replica(run_fn) - dist = self._get_distribution_strategy() - with dist.scope(): - result = dist.call_for_each_replica(_replica_id) - reduced = dist.reduce( + def testReduceToCpu(self, distribution): + with distribution.scope(): + result = distribution.extended.call_for_each_replica(_replica_id) + reduced = distribution.reduce( reduce_util.ReduceOp.SUM, result, destinations="/device:CPU:0") - unwrapped = dist.unwrap(reduced) + unwrapped = distribution.unwrap(reduced) self.assertEqual(1, len(unwrapped)) - expected = sum(range(dist.num_replicas_in_sync)) + expected = sum(range(distribution.num_replicas_in_sync)) self.assertEqual(expected, self.evaluate(unwrapped[0])) - @test_util.run_in_graph_and_eager_modes - def testReduceOnlyFirstReplicaUpdates(self): - if not GPU_TEST: - self.skipTest("Not GPU test") + def testMakeInputFnIterator(self, distribution): + dataset_fn = lambda: dataset_ops.Dataset.range(10) + expected_values = [[i, i+1] for i in range(0, 10, 2)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=2, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) + iterator = distribution.make_input_fn_iterator(input_fn) + self._test_input_fn_iterator(iterator, distribution.extended.worker_devices, + expected_values) + + def testGlobalStepUpdate(self, distribution): + self._test_global_step_update(distribution) + + +def one_device_combinations(): + return combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_one_cpu, + combinations.mirrored_strategy_with_one_gpu, + combinations.core_mirrored_strategy_with_one_cpu, + combinations.core_mirrored_strategy_with_one_gpu], + mode=["graph", "eager"]) + + +class MirroredOneDeviceDistributionTest( + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + @combinations.generate(combinations.combine( + distribution=[ + combinations.NamedDistribution( + "Mirrored1CPU", + lambda: mirrored_strategy.MirroredStrategy(["/device:CPU:0"]), + required_gpus=1), + combinations.mirrored_strategy_with_one_gpu, + combinations.NamedDistribution( + "CoreMirrored1CPU", + lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]), + required_gpus=1), + combinations.core_mirrored_strategy_with_one_gpu], + mode=["graph", "eager"])) + def testReduceToMultipleDestinations(self, distribution): + with distribution.scope(): + reduced = distribution.extended.reduce_to( + reduce_util.ReduceOp.SUM, + 1.0, + destinations=["/device:CPU:0", "/device:GPU:0"]) + unwrapped = distribution.unwrap(reduced) + self.assertLen(unwrapped, 2) + self.assertEqual(1.0, self.evaluate(unwrapped[0])) - def run_fn(): - return 3 + 5 * _replica_id() + @combinations.generate(one_device_combinations()) + def testMinimizeLoss(self, distribution): + if context.executing_eagerly(): + self._test_minimize_loss_eager(distribution) + else: + self._test_minimize_loss_graph(distribution) - dist = self._get_distribution_strategy() - with dist.scope(): - result = dist.call_for_each_replica(run_fn) - reduced = dist.reduce( - reduce_util.ReduceOp.ONLY_FIRST_REPLICA, - result, - destinations="/device:CPU:0") - unwrapped = dist.unwrap(reduced) - self.assertEqual(1, len(unwrapped)) - self.assertEqual(3, self.evaluate(unwrapped[0])) + @combinations.generate(one_device_combinations()) + def testReplicaId(self, distribution): + self._test_replica_id(distribution) - @test_util.run_in_graph_and_eager_modes() - def testReduceToMultipleDestinations(self): - if not GPU_TEST: - self.skipTest("Not GPU test") + @combinations.generate(one_device_combinations()) + def testCallAndMergeExceptions(self, distribution): + self._test_call_and_merge_exceptions(distribution) - devices = ["/device:GPU:0"] - if GPU_TEST: - self.assertGreater(context.num_gpus(), 0) - print(self.id().split(".")[-1], "devices:", ", ".join(devices)) - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): - reduced = dist.reduce( - reduce_util.ReduceOp.SUM, - 1.0, - destinations=["/device:CPU:0", "/device:GPU:0"]) - unwrapped = dist.unwrap(reduced) - self.assertEqual(2, len(unwrapped)) - self.assertEqual(1.0, self.evaluate(unwrapped[0])) +class MirroredStrategyVariableCreatorStackTest( + test.TestCase, parameterized.TestCase): + @combinations.generate(combinations.combine( + distribution=[combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph"])) + def testCreatorStacksAreThreadLocal(self, distribution): + def model_fn(): + replica_id_str = str(self.evaluate(_replica_id())) -class MirroredStrategyVariableCreationTest(test.TestCase): + def thread_creator_fn(next_creator, *args, **kwargs): + return next_creator(*args, **kwargs) + ":thread_" + replica_id_str - config = config_pb2.ConfigProto() - config.allow_soft_placement = True + with variable_scope.variable_creator_scope(thread_creator_fn): + # Create a variable in this scope. + v = variable_scope.variable(1.0) - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Enough GPUs not available for this test in eager mode.") + # This will pause the current thread, and execute the other thread. + ds_context.get_replica_context().merge_call(lambda _: _) + return v - @test_util.run_in_graph_and_eager_modes(config=config) - def testSingleVariable(self): - self._skip_eager_if_gpus_less_than(1) + def main_thread_creator(next_creator, *args, **kwargs): + # We are not using the underlying next_creator for test purposes. + del next_creator, args, kwargs + return "main_thread" + + with context.graph_mode(), \ + distribution.scope(), \ + variable_scope.variable_creator_scope(main_thread_creator): + result = distribution.extended.call_for_each_replica(model_fn) + result = distribution.unwrap(result) + expected = ["main_thread:thread_0", "main_thread:thread_1"] + self.assertEqual(expected, result) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) +class MirroredStrategyVariableCreationTest(test.TestCase): + def testSingleVariable(self, distribution): def model_fn(): # This variable should be created only once across the threads because of - # special variable_creator functions used by `dist.call_for_each_replica`. + # special variable_creator functions used by + # `distribution.extended.call_for_each_replica`. v = variable_scope.variable(1.0, name="foo") ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) - self.assertEquals("foo:0", result.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testUnnamedVariable(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("foo:0", result.name) + def testUnnamedVariable(self, distribution): def model_fn(): v = variable_scope.variable(1.0) ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) # Default name of "Variable" will be used. - self.assertEquals("Variable:0", result.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testMultipleVariables(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("Variable:0", result.name) + def testMultipleVariables(self, distribution): def model_fn(): vs = [] for i in range(5): @@ -223,19 +251,13 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ds_context.get_replica_context().merge_call(lambda _: _) return vs - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) for i, v in enumerate(result): self.assertIsInstance(v, values.MirroredVariable) - self.assertEquals("foo" + str(i) + ":0", v.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testMultipleVariablesWithSameCanonicalName(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("foo" + str(i) + ":0", v.name) + def testMultipleVariablesWithSameCanonicalName(self, distribution): def model_fn(): vs = [] vs.append(variable_scope.variable(1.0, name="foo/bar")) @@ -245,41 +267,30 @@ class MirroredStrategyVariableCreationTest(test.TestCase): ds_context.get_replica_context().merge_call(lambda _: _) return vs - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) for v in result: self.assertIsInstance(v, values.MirroredVariable) - self.assertEquals(4, len(result)) - self.assertEquals("foo/bar:0", result[0].name) - self.assertEquals("foo_1/bar:0", result[1].name) - self.assertEquals("foo_1/bar_1:0", result[2].name) - self.assertEquals("foo/bar_1:0", result[3].name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testVariableWithSameCanonicalNameAcrossThreads(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual(4, len(result)) + self.assertEqual("foo/bar:0", result[0].name) + self.assertEqual("foo_1/bar:0", result[1].name) + self.assertEqual("foo_1/bar_1:0", result[2].name) + self.assertEqual("foo/bar_1:0", result[3].name) + def testVariableWithSameCanonicalNameAcrossThreads(self, distribution): def model_fn(): replica_id = self.evaluate(_replica_id()) v = variable_scope.variable(1.0, name="foo_" + str(replica_id)) ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) self.assertIsInstance(result, values.MirroredVariable) # The resulting mirrored variable will use the name from the first device. - self.assertEquals("foo_0:0", result.name) + self.assertEqual("foo_0:0", result.name) - @test_util.run_in_graph_and_eager_modes(config=config) - def testWithLayers(self): - self._skip_eager_if_gpus_less_than(1) + def testWithLayers(self, distribution): def model_fn(features): with variable_scope.variable_scope("common"): layer1 = core.Dense(1) @@ -294,9 +305,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase): (layer2.kernel, layer2.bias), (layer3.kernel, layer3.bias)] - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - ds = dist.distribute_dataset( + ds = distribution.distribute_dataset( lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)) if context.executing_eagerly(): iterator = ds.make_one_shot_iterator() @@ -306,19 +315,17 @@ class MirroredStrategyVariableCreationTest(test.TestCase): features = iterator.get_next() - with dist.scope(): - result = dist.call_for_each_replica(model_fn, args=(features,)) + with distribution.scope(): + result = distribution.extended.call_for_each_replica( + model_fn, args=(features,)) suffixes = ["", "_1", "_2"] for (kernel, bias), suffix in zip(result, suffixes): self.assertIsInstance(kernel, values.MirroredVariable) - self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name) + self.assertEqual("common/dense" + suffix + "/kernel:0", kernel.name) self.assertIsInstance(bias, values.MirroredVariable) - self.assertEquals("common/dense" + suffix + "/bias:0", bias.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testWithVariableAndVariableScope(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("common/dense" + suffix + "/bias:0", bias.name) + def testWithVariableAndVariableScope(self, distribution): def model_fn(): v0 = variable_scope.variable(1.0, name="var0", aggregation=None) with variable_scope.variable_scope("common"): @@ -338,30 +345,25 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return v0, v1, v2, v3 - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + with distribution.scope(): v = variable_scope.variable(1.0, name="var-main0") - self.assertEquals("var-main0:0", v.name) + self.assertEqual("var-main0:0", v.name) - result = dist.call_for_each_replica(model_fn) - self.assertEquals(4, len(result)) + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(4, len(result)) v0, v1, v2, v3 = result self.assertIsInstance(v0, values.MirroredVariable) - self.assertEquals("var0:0", v0.name) + self.assertEqual("var0:0", v0.name) self.assertIsInstance(v1, values.MirroredVariable) - self.assertEquals("common/var1:0", v1.name) + self.assertEqual("common/var1:0", v1.name) self.assertIsInstance(v2, values.ReplicaLocalVariable) - self.assertEquals("common/var2:0", v2.name) - self.assertEquals(variable_scope.VariableAggregation.SUM, v2.aggregation) + self.assertEqual("common/var2:0", v2.name) + self.assertEqual(variable_scope.VariableAggregation.SUM, v2.aggregation) self.assertIsInstance(v3, values.MirroredVariable) - self.assertEquals("common/var3:0", v3.name) - self.assertEquals(variable_scope.VariableAggregation.MEAN, v3.aggregation) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testWithGetVariableAndVariableScope(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("common/var3:0", v3.name) + self.assertEqual(variable_scope.VariableAggregation.MEAN, v3.aggregation) + def testWithGetVariableAndVariableScope(self, distribution): def model_fn(): v0 = variable_scope.get_variable("var0", [1]) with variable_scope.variable_scope("common"): @@ -379,33 +381,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return v0, v1, v2, v3 - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + with distribution.scope(): with variable_scope.variable_scope("main"): v = variable_scope.get_variable("var-main0", [1]) - self.assertEquals("main/var-main0:0", v.name) + self.assertEqual("main/var-main0:0", v.name) - result = dist.call_for_each_replica(model_fn) - self.assertEquals(4, len(result)) + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(4, len(result)) v0, v1, v2, v3 = result self.assertIsInstance(v0, values.MirroredVariable) - self.assertEquals("main/var0:0", v0.name) + self.assertEqual("main/var0:0", v0.name) self.assertIsInstance(v1, values.MirroredVariable) - self.assertEquals("main/common/var1:0", v1.name) + self.assertEqual("main/common/var1:0", v1.name) self.assertIsInstance(v2, values.ReplicaLocalVariable) - self.assertEquals("main/common/var2:0", v2.name) - self.assertEquals(variable_scope.VariableAggregation.SUM, - v2.aggregation) + self.assertEqual("main/common/var2:0", v2.name) + self.assertEqual(variable_scope.VariableAggregation.SUM, + v2.aggregation) self.assertIsInstance(v3, values.MirroredVariable) - self.assertEquals("main/common/var3:0", v3.name) - self.assertEquals(variable_scope.VariableAggregation.MEAN, - v3.aggregation) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testOnlyFirstReplicaUpdatesVariables(self): - self._skip_eager_if_gpus_less_than(1) + self.assertEqual("main/common/var3:0", v3.name) + self.assertEqual(variable_scope.VariableAggregation.MEAN, + v3.aggregation) + def testOnlyFirstReplicaUpdatesVariables(self, distribution): def create_fn(): aggregation = variable_scope.VariableAggregation.ONLY_FIRST_REPLICA v0 = variable_scope.variable( @@ -421,17 +418,16 @@ class MirroredStrategyVariableCreationTest(test.TestCase): return v0, v1 devices = ["/device:GPU:0", "/device:CPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): - v0, v1 = dist.call_for_each_replica(create_fn) + with distribution.scope(): + v0, v1 = distribution.extended.call_for_each_replica(create_fn) self.evaluate(v0.initializer) self.assertEqual(2.0, self.evaluate(v0.get(devices[0]))) self.assertEqual(2.0, self.evaluate(v0.get(devices[1]))) - self.assertEqual(2.0, self.evaluate(dist.read_var(v0))) + self.assertEqual(2.0, self.evaluate(distribution.extended.read_var(v0))) self.evaluate(v1.initializer) self.assertEqual(3.0, self.evaluate(v1.get(devices[0]))) self.assertEqual(3.0, self.evaluate(v1.get(devices[1]))) - self.assertEqual(3.0, self.evaluate(dist.read_var(v1))) + self.assertEqual(3.0, self.evaluate(distribution.extended.read_var(v1))) def replica_id_plus_one(): return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) @@ -442,24 +438,27 @@ class MirroredStrategyVariableCreationTest(test.TestCase): update1 = v1.assign_add(7.0 * replica_id_plus_one()) return update0, update1 - update0a, update1a = dist.call_for_each_replica(update_member_fn) + update0a, update1a = distribution.extended.call_for_each_replica( + update_member_fn) # Update "sync on read" variable. - self.evaluate(dist.group(update0a)) + self.evaluate(distribution.group(update0a)) self.assertEqual(2.0 + 5.0, self.evaluate(v0.get(devices[0]))) # Writes are not synchronized for "sync on read" variables, # so device[1] can end up with a different value. self.assertEqual(2.0 + 2*5.0, self.evaluate(v0.get(devices[1]))) # Always reads from device 0. - self.assertEqual(2.0 + 5.0, self.evaluate(dist.read_var(v0))) + self.assertEqual(2.0 + 5.0, self.evaluate( + distribution.extended.read_var(v0))) # Update "sync on write" variable. - self.evaluate(dist.group(update1a)) + self.evaluate(distribution.group(update1a)) self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[0]))) # Writes are synchronized for v1, only the argument to assign_add on # device[0] is used. self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[1]))) - self.assertEqual(3.0 + 7.0, self.evaluate(dist.read_var(v1))) + self.assertEqual(3.0 + 7.0, self.evaluate( + distribution.extended.read_var(v1))) # Update using state_ops.assign_add global function. def update_state_ops_fn(): @@ -467,26 +466,25 @@ class MirroredStrategyVariableCreationTest(test.TestCase): update1 = state_ops.assign_add(v1, 13.0 * replica_id_plus_one()) return update0, update1 - update0b, update1b = dist.call_for_each_replica(update_state_ops_fn) - self.evaluate(dist.group(update0b)) + update0b, update1b = distribution.extended.call_for_each_replica( + update_state_ops_fn) + self.evaluate(distribution.group(update0b)) # Update "sync on read" variable. self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(v0.get(devices[0]))) self.assertEqual(2.0 + 2*5.0 + 2*11.0, self.evaluate(v0.get(devices[1]))) - self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(dist.read_var(v0))) + self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate( + distribution.extended.read_var(v0))) # Update "sync on write" variable. - self.evaluate(dist.group(update1b)) + self.evaluate(distribution.group(update1b)) self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[0]))) self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[1]))) - self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(dist.read_var(v1))) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testNoneSynchronizationWithGetVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate( + distribution.extended.read_var(v1))) + + def testNoneSynchronizationWithGetVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "`NONE` variable synchronization mode is not " "supported with `Mirrored` distribution strategy. Please change " @@ -495,12 +493,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase): "v", [1], synchronization=variable_scope.VariableSynchronization.NONE) - @test_util.run_in_graph_and_eager_modes(config=config) - def testNoneSynchronizationWithVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testNoneSynchronizationWithVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "`NONE` variable synchronization mode is not " "supported with `Mirrored` distribution strategy. Please change " @@ -510,23 +504,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase): name="v", synchronization=variable_scope.VariableSynchronization.NONE) - @test_util.run_in_graph_and_eager_modes(config=config) - def testInvalidSynchronizationWithVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testInvalidSynchronizationWithVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "Invalid variable synchronization mode: Invalid for " "variable: v"): variable_scope.variable(1.0, name="v", synchronization="Invalid") - @test_util.run_in_graph_and_eager_modes(config=config) - def testInvalidAggregationWithGetVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testInvalidAggregationWithGetVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "Invalid variable aggregation mode: invalid for " "variable: v"): @@ -535,12 +521,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase): synchronization=variable_scope.VariableSynchronization.ON_WRITE, aggregation="invalid") - @test_util.run_in_graph_and_eager_modes(config=config) - def testInvalidAggregationWithVariable(self): - self._skip_eager_if_gpus_less_than(1) - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + def testInvalidAggregationWithVariable(self, distribution): + with distribution.scope(): with self.assertRaisesRegexp( ValueError, "Invalid variable aggregation mode: invalid for " "variable: v"): @@ -550,47 +532,21 @@ class MirroredStrategyVariableCreationTest(test.TestCase): synchronization=variable_scope.VariableSynchronization.ON_WRITE, aggregation="invalid") - @test_util.run_in_graph_and_eager_modes(config=config) - def testThreeDevices(self): - self._skip_eager_if_gpus_less_than(2) - - def model_fn(): - v = variable_scope.variable(1.0, name="foo") - ds_context.get_replica_context().merge_call(lambda _: _) - return v - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]) - - with dist.scope(): - result = dist.call_for_each_replica(model_fn) - self.assertIsInstance(result, values.MirroredVariable) - self.assertEquals("foo:0", result.name) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testNonMatchingVariableCreation(self): - self._skip_eager_if_gpus_less_than(1) - + def testNonMatchingVariableCreation(self, distribution): def model_fn(name): v = variable_scope.variable(1.0, name=name) ds_context.get_replica_context().merge_call(lambda _: _) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): + with distribution.scope(): names = values.DistributedValues({ "/device:CPU:0": "foo", "/device:GPU:0": "bar" }) with self.assertRaises(RuntimeError): - _ = dist.call_for_each_replica(model_fn, args=(names,)) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testReplicaLocalVariable(self): - self._skip_eager_if_gpus_less_than(1) + _ = distribution.extended.call_for_each_replica(model_fn, args=(names,)) + def testReplicaLocalVariable(self, distribution): all_v_sum = {} all_v_mean = {} components_sum = {} @@ -620,13 +576,10 @@ class MirroredStrategyVariableCreationTest(test.TestCase): self.assertIsNot(v_mean, c_mean) return updates, v_sum, v_mean, c_sum, c_mean - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): + with distribution.scope(): # Create "sum" and "mean" versions of ReplicaLocalVariables. ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = ( - dist.call_for_each_replica(model_fn)) + distribution.extended.call_for_each_replica(model_fn)) # Should see the same wrapping instance in all replicas. self.assertIs(all_v_sum[0], ret_v_sum) self.assertIs(all_v_mean[0], ret_v_mean) @@ -641,10 +594,10 @@ class MirroredStrategyVariableCreationTest(test.TestCase): # Apply updates self.evaluate(variables.global_variables_initializer()) - self.evaluate([y for x in ret_ops for y in dist.unwrap(x)]) + self.evaluate([y for x in ret_ops for y in distribution.unwrap(x)]) expected_sum = 0.0 expected_mean = 0.0 - for i, d in enumerate(dist.extended.worker_devices): + for i, d in enumerate(distribution.extended.worker_devices): # Should see different values on different devices. v_sum_value = self.evaluate(ret_v_sum.get(d).read_value()) v_mean_value = self.evaluate(ret_v_mean.get(d).read_value()) @@ -654,22 +607,86 @@ class MirroredStrategyVariableCreationTest(test.TestCase): expected = i * 6.0 self.assertEqual(expected, v_mean_value) expected_mean += expected - expected_mean /= len(dist.extended.worker_devices) + expected_mean /= len(distribution.extended.worker_devices) # Without get(device), should return the value you get by # applying the reduction across all replicas (whether you use # read_var(), get(), or nothing). - self.assertEqual(expected_sum, self.evaluate(dist.read_var(ret_v_sum))) - self.assertEqual(expected_mean, self.evaluate(dist.read_var(ret_v_mean))) + self.assertEqual(expected_sum, self.evaluate( + distribution.extended.read_var(ret_v_sum))) + self.assertEqual(expected_mean, self.evaluate( + distribution.extended.read_var(ret_v_mean))) self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get())) self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get())) self.assertEqual(expected_sum, self.evaluate(ret_v_sum)) self.assertEqual(expected_mean, self.evaluate(ret_v_mean)) + # TODO(priyag): Update this test to work in eager mode as well. + def testDynamicRnnVariables(self, distribution): + def model_fn(): + inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]]) + cell_fw = rnn_cell_impl.LSTMCell(300) + cell_bw = rnn_cell_impl.LSTMCell(300) + (outputs, _) = rnn.bidirectional_dynamic_rnn( + cell_fw, + cell_bw, + inputs, + dtype=dtypes.float32) + return outputs + + with context.graph_mode(), distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + # Two variables are created by the RNN layer. + self.assertEqual(2, len(result)) + for v in result: + self.assertIsInstance(v, values.DistributedValues) + _, v1 = distribution.unwrap(v) + self.assertStartsWith(v1._op.name, "replica_1/") + + def testReplicaLocalVariableUpdate(self, distribution): + def model_fn(): + v_sum = variable_scope.variable( + 1.0, + synchronization=variable_scope.VariableSynchronization.ON_READ, + aggregation=variable_scope.VariableAggregation.SUM) + self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) + return v_sum + + def update(var, value): + return var.assign(value) + + with distribution.scope(): + ret_v_sum = distribution.extended.call_for_each_replica(model_fn) + + # Initialize variables. + self.evaluate(variables.global_variables_initializer()) + # Assert that the aggregated value of the replica local vars is the sum + # of the individual values before running the update ops. + self.assertEqual(1.0, self.evaluate(ret_v_sum.get( + distribution.extended.worker_devices[0]).read_value())) + self.assertEqual(2.0, self.evaluate(ret_v_sum)) + + # Apply updates. + update_ops = distribution.extended.update( + ret_v_sum, update, args=(5.0,), group=False) + self.evaluate(update_ops) + # Assert that the aggregated value of the replica local vars is the sum + # of the individual values after running the update ops. + self.assertEqual(5.0, self.evaluate(ret_v_sum.get( + distribution.extended.worker_devices[0]).read_value())) + self.assertEqual(10.0, self.evaluate(ret_v_sum)) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph"])) +class MirroredStrategyNameScopeTest(test.TestCase): # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not # testing this in eager mode. - def testNameScope(self): + def testNameScope(self, distribution): def model_fn(): with ops.name_scope("foo"): a = constant_op.constant(1.0, name="a") @@ -677,20 +694,17 @@ class MirroredStrategyVariableCreationTest(test.TestCase): b = constant_op.constant(1.0, name="b") return a, b - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): + with context.graph_mode(), distribution.scope(): with ops.name_scope("main"): - result = dist.call_for_each_replica(model_fn) - self.assertEquals(2, len(result)) + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(2, len(result)) for v, name in zip(result, ["a", "b"]): self.assertIsInstance(v, values.DistributedValues) - v0, v1 = dist.unwrap(v) - self.assertEquals("main/foo/" + name + ":0", v0.name) - self.assertEquals("main/replica_1/foo/" + name + ":0", v1.name) + v0, v1 = distribution.unwrap(v) + self.assertEqual("main/foo/" + name + ":0", v0.name) + self.assertEqual("main/replica_1/foo/" + name + ":0", v1.name) - def testWithDefaultName(self): + def testWithDefaultName(self, distribution): def model_fn(): with ops.name_scope(None, "foo"): a = constant_op.constant(1.0, name="a") @@ -698,23 +712,20 @@ class MirroredStrategyVariableCreationTest(test.TestCase): b = constant_op.constant(2.0, name="b") return a, b - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - result = dist.call_for_each_replica(model_fn) - self.assertEquals(2, len(result)) + with context.graph_mode(), distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + self.assertEqual(2, len(result)) for v, name in zip(result, ["a", "b"]): self.assertIsInstance(v, values.DistributedValues) - v0, v1 = dist.unwrap(v) - self.assertEquals("foo/" + name + ":0", v0.name) - self.assertEquals("replica_1/foo/" + name + ":0", v1.name) + v0, v1 = distribution.unwrap(v) + self.assertEqual("foo/" + name + ":0", v0.name) + self.assertEqual("replica_1/foo/" + name + ":0", v1.name) # variable_scope.variable() respects name scopes when creating # variables. On the other hand variable_scope.get_variable() ignores name # scopes when creating variables. We test both methods of creating variables # to make sure that we have the same variable names in both cases. - def testNameScopeWithVariable(self): + def testNameScopeWithVariable(self, distribution): def in_cross_replica(_): c = variable_scope.variable(1.0, name="c") return c @@ -725,28 +736,25 @@ class MirroredStrategyVariableCreationTest(test.TestCase): c = ds_context.get_replica_context().merge_call(in_cross_replica) return b, c - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): + with context.graph_mode(), distribution.scope(): with ops.name_scope("main"): a = variable_scope.variable(1.0, name="a") - result = dist.call_for_each_replica(model_fn) + result = distribution.extended.call_for_each_replica(model_fn) result_b = result[0] result_c = result[1] self.assertIsInstance(result_b, values.DistributedValues) self.assertIsInstance(result_c, values.DistributedValues) - a0, a1 = dist.unwrap(a) - b0, b1 = dist.unwrap(result_b) - c0, c1 = dist.unwrap(result_c) - self.assertEquals("main/a:0", a0.name) - self.assertEquals("main/a/replica_1:0", a1.name) - self.assertEquals("main/b:0", b0.name) - self.assertEquals("main/b/replica_1:0", b1.name) - self.assertEquals("main/foo/c:0", c0.name) - self.assertEquals("main/foo/c/replica_1:0", c1.name) - - def testNameScopeWithGetVariable(self): + a0, a1 = distribution.unwrap(a) + b0, b1 = distribution.unwrap(result_b) + c0, c1 = distribution.unwrap(result_c) + self.assertEqual("main/a:0", a0.name) + self.assertEqual("main/a/replica_1:0", a1.name) + self.assertEqual("main/b:0", b0.name) + self.assertEqual("main/b/replica_1:0", b1.name) + self.assertEqual("main/foo/c:0", c0.name) + self.assertEqual("main/foo/c/replica_1:0", c1.name) + + def testNameScopeWithGetVariable(self, distribution): def in_cross_replica(_): c = variable_scope.get_variable("c", [1]) return c @@ -757,114 +765,75 @@ class MirroredStrategyVariableCreationTest(test.TestCase): c = ds_context.get_replica_context().merge_call(in_cross_replica) return b, c - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): + with context.graph_mode(), distribution.scope(): with ops.name_scope("main"): a = variable_scope.get_variable("a", [1]) - result = dist.call_for_each_replica(model_fn) + result = distribution.extended.call_for_each_replica(model_fn) result_b = result[0] result_c = result[1] self.assertIsInstance(result_b, values.DistributedValues) self.assertIsInstance(result_c, values.DistributedValues) - a0, a1 = dist.unwrap(a) - b0, b1 = dist.unwrap(result_b) - c0, c1 = dist.unwrap(result_c) - self.assertEquals("a:0", a0.name) - self.assertEquals("a/replica_1:0", a1.name) - self.assertEquals("b:0", b0.name) - self.assertEquals("b/replica_1:0", b1.name) - self.assertEquals("c:0", c0.name) - self.assertEquals("c/replica_1:0", c1.name) - - def testDynamicRnnVariables(self): + a0, a1 = distribution.unwrap(a) + b0, b1 = distribution.unwrap(result_b) + c0, c1 = distribution.unwrap(result_c) + self.assertEqual("a:0", a0.name) + self.assertEqual("a/replica_1:0", a1.name) + self.assertEqual("b:0", b0.name) + self.assertEqual("b/replica_1:0", b1.name) + self.assertEqual("c:0", c0.name) + self.assertEqual("c/replica_1:0", c1.name) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.NamedDistribution( + "Mirrored3Devices", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]), + required_gpus=2), + combinations.NamedDistribution( + "CoreMirrored3Devices", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.CoreMirroredStrategy( + ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]), + required_gpus=2)], + mode=["graph", "eager"])) +class MirroredThreeDeviceDistributionTest( + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + def testThreeDevices(self, distribution): def model_fn(): - inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]]) - cell_fw = rnn_cell_impl.LSTMCell(300) - cell_bw = rnn_cell_impl.LSTMCell(300) - (outputs, _) = rnn.bidirectional_dynamic_rnn( - cell_fw, - cell_bw, - inputs, - dtype=dtypes.float32) - return outputs - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with context.graph_mode(), dist.scope(): - result = dist.call_for_each_replica(model_fn) - # Two variables are created by the RNN layer. - self.assertEquals(2, len(result)) - for v in result: - self.assertIsInstance(v, values.DistributedValues) - _, v1 = dist.unwrap(v) - self.assertStartsWith(v1.name, "replica_1/") - - @test_util.run_in_graph_and_eager_modes(config=config) - def testReplicaLocalVariableUpdate(self): - with context.graph_mode(): - - def model_fn(): - v_sum = variable_scope.variable( - 1.0, - synchronization=variable_scope.VariableSynchronization.ON_READ, - aggregation=variable_scope.VariableAggregation.SUM) - self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) - return v_sum - - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:GPU:1"]) - - def update(var, value): - return var.assign(value) - - with dist.scope(): - ret_v_sum = dist.call_for_each_replica(model_fn) - update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False) - - # Initialize variables. - self.evaluate(variables.global_variables_initializer()) - # Assert that the aggregated value of the replica local vars is the sum - # of the individual values before running the update ops. - self.assertEquals(1.0, self.evaluate( - ret_v_sum.get(dist.extended.worker_devices[0]).read_value())) - self.assertEquals(2.0, self.evaluate(ret_v_sum)) + v = variable_scope.variable(1.0, name="foo") + ds_context.get_replica_context().merge_call(lambda _: _) + return v - # Apply updates. - self.evaluate(update_ops) - # Assert that the aggregated value of the replica local vars is the sum - # of the individual values after running the update ops. - self.assertEquals(5.0, self.evaluate( - ret_v_sum.get(dist.extended.worker_devices[0]).read_value())) - self.assertEquals(10.0, self.evaluate(ret_v_sum)) + with distribution.scope(): + result = distribution.extended.call_for_each_replica(model_fn) + self.assertIsInstance(result, values.MirroredVariable) + self.assertEqual("foo:0", result.name) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredVariableUpdateTest(test.TestCase): # The following tests check assign, assign_add and assign_sub on Mirrored # variables in replica and cross replica context. - config = config_pb2.ConfigProto() - config.allow_soft_placement = True - - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Enough GPUs not available for this test in eager mode.") - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContextWithoutAggregationType(self): + def testAssignMirroredVarReplicaContextWithoutAggregationType(self, + distribution): # Test that we always have an aggregation type set on the mirrored variable # if we assign to it in replica mode. - self._skip_eager_if_gpus_less_than(1) def var_fn(): v = variable_scope.variable(1.0, name="foo") return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) @@ -874,23 +843,19 @@ class MirroredVariableUpdateTest(test.TestCase): with self.assertRaisesRegexp( ValueError, "You must specify an aggregation method to update a " "MirroredVariable in Replica Context."): - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContextWithSum(self): + def testAssignMirroredVarReplicaContextWithSum(self, distribution): # Test that we don't reduce a non-per-replica value with the "sum" # aggregation type. - self._skip_eager_if_gpus_less_than(1) def var_fn(): v = variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.SUM) return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) @@ -900,40 +865,31 @@ class MirroredVariableUpdateTest(test.TestCase): with self.assertRaisesRegexp( ValueError, "A non-DistributedValues value 5.0 cannot be reduced " "with the given reduce op ReduceOp.SUM."): - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarCrossDeviceContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignMirroredVarCrossDeviceContext(self, distribution): def var_fn(): return variable_scope.variable(1.0, name="foo") - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) mirrored_var_result = self.evaluate(mirrored_var.assign(6.0)) - self.assertEquals(6.0, mirrored_var_result) + self.assertEqual(6.0, mirrored_var_result) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignMirroredVarReplicaContext(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): value = math_ops.cast( @@ -941,73 +897,60 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var.dtype) return mirrored_var.assign(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(0.5, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(0.5, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignMirroredVarReplicaContextWithSingleValue(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignMirroredVarReplicaContextWithSingleValue(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): return mirrored_var.assign(5.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(5.0, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignAddMirroredVarCrossDeviceContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignAddMirroredVarCrossDeviceContext(self, distribution): def var_fn(): return variable_scope.variable(1.0, name="foo") - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) # read_value == True mirrored_var_result = self.evaluate( mirrored_var.assign_add(6.0, read_value=True)) - self.assertEquals(7.0, mirrored_var_result) - self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) - self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEqual(7.0, mirrored_var_result) + self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) # read_value == False self.evaluate(mirrored_var.assign_add(2.0, read_value=False)) - self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) - self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignAddMirroredVarReplicaContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignAddMirroredVarReplicaContext(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): value = math_ops.cast( @@ -1015,65 +958,52 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var.dtype) return mirrored_var.assign_add(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(1.5, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(1.5, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignAddMirroredVarReplicaContextWithSingleValue(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignAddMirroredVarReplicaContextWithSingleValue(self, distribution): def var_fn(): return variable_scope.variable( 1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(1.0, self.evaluate(mirrored_var)) + self.assertEqual(1.0, self.evaluate(mirrored_var)) def model_fn(): return mirrored_var.assign_add(5.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(6.0, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(6.0, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignSubMirroredVarCrossDeviceContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignSubMirroredVarCrossDeviceContext(self, distribution): def var_fn(): return variable_scope.variable(5.0, name="foo") - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.assertEqual(5.0, self.evaluate(mirrored_var)) mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0)) - self.assertEquals(3.0, mirrored_var_result) - self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) - self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) + self.assertEqual(3.0, mirrored_var_result) + self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:GPU:0"))) + self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:CPU:0"))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignSubMirroredVarReplicaContext(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignSubMirroredVarReplicaContext(self, distribution): def var_fn(): return variable_scope.variable( 5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.assertEqual(5.0, self.evaluate(mirrored_var)) def model_fn(): value = math_ops.cast( @@ -1081,37 +1011,37 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var.dtype) return mirrored_var.assign_sub(value) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(4.5, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(4.5, self.evaluate(mirrored_var)) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignSubMirroredVarReplicaContextWithSingleValue(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignSubMirroredVarReplicaContextWithSingleValue(self, distribution): def var_fn(): return variable_scope.variable( 5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN) - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.evaluate(variables.global_variables_initializer()) - self.assertEquals(5.0, self.evaluate(mirrored_var)) + self.assertEqual(5.0, self.evaluate(mirrored_var)) def model_fn(): return mirrored_var.assign_sub(1.0) - self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn))) - self.assertEquals(4.0, self.evaluate(mirrored_var)) + self.evaluate(distribution.unwrap( + distribution.extended.call_for_each_replica(model_fn))) + self.assertEqual(4.0, self.evaluate(mirrored_var)) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): - config = config_pb2.ConfigProto() - config.allow_soft_placement = True - def testAssignMirroredVarInitializer(self): + def testAssignMirroredVarInitializer(self, distribution): # This test is not eager compatible since in eager variables are initialized # upon construction instead of once the initialization op is run. with context.graph_mode(): @@ -1119,17 +1049,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): v = variable_scope.variable(1.0, name="foo") return v - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - mirrored_var = dist.call_for_each_replica(var_fn) + with distribution.scope(): + mirrored_var = distribution.extended.call_for_each_replica(var_fn) self.assertIsInstance(mirrored_var, values.MirroredVariable) self.assertFalse(self.evaluate(mirrored_var.is_initialized())) self.evaluate(mirrored_var.initializer) self.assertTrue(self.evaluate(mirrored_var.is_initialized())) - def testAssignReplicaLocalVarInitializer(self): + def testAssignReplicaLocalVarInitializer(self, distribution): # This test is not eager compatible since in eager variables are initialized # upon construction instead of once the initialization op is run. with context.graph_mode(): @@ -1141,11 +1068,9 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable)) return v_sum - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn) + with distribution.scope(): + replica_local_var = distribution.extended.call_for_each_replica( + model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.assertFalse(self.evaluate(replica_local_var.is_initialized())) @@ -1153,17 +1078,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase): self.assertTrue(self.evaluate(replica_local_var.is_initialized())) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class ReplicaLocalVariableAssignTest(test.TestCase): - config = config_pb2.ConfigProto() - config.allow_soft_placement = True - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Not enough GPUs available for this test in eager mode.") - - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignReplicaLocalVarSumAggregation(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignReplicaLocalVarSumAggregation(self, distribution): def model_fn(): v_sum = variable_scope.variable( 1.0, @@ -1171,18 +1093,16 @@ class ReplicaLocalVariableAssignTest(test.TestCase): aggregation=variable_scope.VariableAggregation.SUM) return v_sum - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn) + with distribution.scope(): + replica_local_var = distribution.extended.call_for_each_replica(model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.evaluate(variables.global_variables_initializer()) # Each replica has a value of 1.0 assigned to it in replica context. # When we read the value using `read_var` we should see the SUM of each of # values on each of the replicas. - self.assertEqual(2.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(2.0, self.evaluate( + distribution.read_var(replica_local_var))) # Assigning 6.0 in cross replica context will assign a value of # 6.0/num_replicas to each replica. tlv_ops = replica_local_var.assign(6.0) @@ -1190,11 +1110,10 @@ class ReplicaLocalVariableAssignTest(test.TestCase): # On reading the replica local var we should get the assigned value back. # The value on all the replicas are added before being returned by # `read_var`. - self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(6.0, self.evaluate( + distribution.read_var(replica_local_var))) - @test_util.run_in_graph_and_eager_modes(config=config) - def testAssignReplicaLocalVarMeanAggregation(self): - self._skip_eager_if_gpus_less_than(1) + def testAssignReplicaLocalVarMeanAggregation(self, distribution): def model_fn(): v_sum = variable_scope.variable( 1.0, @@ -1202,23 +1121,22 @@ class ReplicaLocalVariableAssignTest(test.TestCase): aggregation=variable_scope.VariableAggregation.MEAN) return v_sum - dist = mirrored_strategy.MirroredStrategy( - ["/device:GPU:0", "/device:CPU:0"]) - - with dist.scope(): - replica_local_var = dist.call_for_each_replica(model_fn) + with distribution.scope(): + replica_local_var = distribution.extended.call_for_each_replica(model_fn) self.assertTrue(isinstance(replica_local_var, values.ReplicaLocalVariable)) self.evaluate(variables.global_variables_initializer()) # Each replica has a value of 1.0 assigned to it in replica context. # When we read the value using `read_var` we should see the MEAN of values # on all replicas which is the value assigned in replica context. - self.assertEqual(1.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(1.0, self.evaluate( + distribution.read_var(replica_local_var))) tlv_ops = replica_local_var.assign(6.0) self.evaluate(tlv_ops) # On reading the replica local var we should get the MEAN of all values # which is equal to the value assigned. - self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var))) + self.assertEqual(6.0, self.evaluate( + distribution.read_var(replica_local_var))) class MockModel(object): @@ -1252,24 +1170,25 @@ class MiniModel(keras_training.Model): return self.fc(inputs) +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) class MirroredStrategyDefunTest(test.TestCase): - def _skip_eager_if_gpus_less_than(self, num_gpus): - if context.num_gpus() < num_gpus and context.executing_eagerly(): - self.skipTest("Not enough GPUs available for this test in eager mode.") - - def _call_and_check(self, model_fn, inputs, expected_result, defuns, - two_variables=False): + def _call_and_check(self, distribution, model_fn, inputs, expected_result, + defuns, two_variables=False): cpu_dev = device_util.canonicalize("CPU:0") gpu_dev = device_util.canonicalize("GPU:0") devices = [cpu_dev, gpu_dev] - dist = mirrored_strategy.MirroredStrategy(devices) - with dist.scope(): + with distribution.scope(): mock_model = MockModel(two_variables) self.evaluate(variables.global_variables_initializer()) - result = dist.call_for_each_replica(model_fn, args=[mock_model] + inputs) + result = distribution.extended.call_for_each_replica( + model_fn, args=[mock_model] + inputs) for device in devices: device_result = values.select_device(device, result) device_expected_result = values.select_device(device, expected_result) @@ -1281,17 +1200,15 @@ class MirroredStrategyDefunTest(test.TestCase): # call_for_each has one trace per device. To check that the expected set # of variables was accessed on each trace, we first retrieve each # device-specific graph function. - per_replica_graph_functions = dist.call_for_each_replica( - defun.get_concrete_function, args=[mock_model] + inputs) + per_replica_graph_functions = ( + distribution.extended.call_for_each_replica( + defun.get_concrete_function, args=[mock_model] + inputs)) for device in devices: graph_function = per_replica_graph_functions.get(device=device) self.assertEqual(set(mock_model.variables), set(graph_function.graph.variables)) - @test_util.run_in_graph_and_eager_modes() - def testVariableInDefun(self): - self._skip_eager_if_gpus_less_than(1) - + def testVariableInDefun(self, distribution): @function.defun def times_two(mock_model): return mock_model() @@ -1299,12 +1216,9 @@ class MirroredStrategyDefunTest(test.TestCase): def model_fn(mock_model): return times_two(mock_model) - self._call_and_check(model_fn, [], 2.5, [times_two]) - - @test_util.run_in_graph_and_eager_modes() - def testVariableInNestedDefun(self): - self._skip_eager_if_gpus_less_than(1) + self._call_and_check(distribution, model_fn, [], 2.5, [times_two]) + def testVariableInNestedDefun(self, distribution): @function.defun def times_two(mock_model): return mock_model() @@ -1316,12 +1230,10 @@ class MirroredStrategyDefunTest(test.TestCase): def model_fn(mock_model): return two_x_plus_one(mock_model) - self._call_and_check(model_fn, [], 3.5, [times_two, two_x_plus_one]) - - @test_util.run_in_graph_and_eager_modes() - def testTwoVariablesInNestedDefun(self): - self._skip_eager_if_gpus_less_than(1) + self._call_and_check(distribution, model_fn, [], 3.5, + [times_two, two_x_plus_one]) + def testTwoVariablesInNestedDefun(self, distribution): @function.defun def fn1(mock_model): return mock_model() @@ -1333,12 +1245,10 @@ class MirroredStrategyDefunTest(test.TestCase): def model_fn(mock_model): return fn2(mock_model) - self._call_and_check(model_fn, [], 5.5, [fn1, fn2], two_variables=True) - - @test_util.run_in_graph_and_eager_modes() - def testGradientTapeOverNestedDefuns(self): - self._skip_eager_if_gpus_less_than(1) + self._call_and_check(distribution, model_fn, [], 5.5, [fn1, fn2], + two_variables=True) + def testGradientTapeOverNestedDefuns(self, distribution): @function.defun def fn1(mock_model): return mock_model() @@ -1354,13 +1264,10 @@ class MirroredStrategyDefunTest(test.TestCase): [v.get() for v in mock_model.variables]) return grads - self._call_and_check(model_fn, [], [2.0, 1.0], [fn1, fn2], + self._call_and_check(distribution, model_fn, [], [2.0, 1.0], [fn1, fn2], two_variables=True) - @test_util.run_in_graph_and_eager_modes() - def testPassPerReplica(self): - self._skip_eager_if_gpus_less_than(1) - + def testPassPerReplica(self, distribution): @function.defun def fn1(mock_model, factor): return mock_model(factor) @@ -1368,18 +1275,10 @@ class MirroredStrategyDefunTest(test.TestCase): factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0}) expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25, "GPU:0": 3.0 * 1.25}) - self._call_and_check(fn1, [factors], expected_result, [fn1]) - - @test_util.run_in_graph_and_eager_modes() - def testTrain(self): - self._skip_eager_if_gpus_less_than(1) - - cpu_dev = device_util.canonicalize("CPU:0") - gpu_dev = device_util.canonicalize("GPU:0") - devices = [cpu_dev, gpu_dev] - dist = mirrored_strategy.MirroredStrategy(devices) + self._call_and_check(distribution, fn1, [factors], expected_result, [fn1]) - with dist.scope(): + def testTrain(self, distribution): + with distribution.scope(): mock_model = MiniModel() mock_model.call = function.defun(mock_model.call) @@ -1389,10 +1288,11 @@ class MirroredStrategyDefunTest(test.TestCase): gradients_fn = backprop.implicit_grad(loss_fn) gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) - grads_and_vars = dist.call_for_each_replica(gradients_fn, args=(None,)) + grads_and_vars = distribution.extended.call_for_each_replica( + gradients_fn, args=(None,)) optimizer = gradient_descent.GradientDescentOptimizer(0.25) - update_ops = optimizer._distributed_apply(dist, grads_and_vars) # pylint: disable=protected-access + update_ops = optimizer._distributed_apply(distribution, grads_and_vars) # pylint: disable=protected-access if not context.executing_eagerly(): self.evaluate(variables.global_variables_initializer()) @@ -1404,30 +1304,73 @@ class MirroredStrategyDefunTest(test.TestCase): self.assertAllEqual([0.5], updated_var_values[1]) +@combinations.generate( + combinations.combine( + distribution=[ + combinations.NamedDistribution( + "Mirrored", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=context.num_gpus()), + required_gpus=1), + combinations.NamedDistribution( + "CoreMirrored", + # pylint: disable=g-long-lambda + lambda: mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=context.num_gpus()), + required_gpus=1) + ], + mode=["graph"])) class MultiWorkerMirroredStrategyTest( multi_worker_test_base.MultiWorkerTestBase, strategy_test_lib.DistributionTestBase): - def _get_distribution_strategy(self): + def _configure_distribution_strategy(self, distribution): cluster_spec = server_lib.ClusterSpec({ "worker": ["/job:worker/task:0", "/job:worker/task:1"] }) - strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus()) - strategy.configure(cluster_spec=cluster_spec) - return strategy - - def test_num_replicas_in_sync(self): - if not GPU_TEST: - self.skipTest("Not GPU test") + distribution.configure(cluster_spec=cluster_spec) - strategy = self._get_distribution_strategy() + def test_num_replicas_in_sync(self, distribution): + self._configure_distribution_strategy(distribution) # We calculate the total number of gpus across the workers(2) specified in # the cluster spec. - self.assertEqual(context.num_gpus() * 2, strategy.num_replicas_in_sync) - - def testMinimizeLossGraph(self): - self._test_minimize_loss_graph(self._get_distribution_strategy(), - learning_rate=0.05) + self.assertEqual(context.num_gpus() * 2, distribution.num_replicas_in_sync) + + def testMinimizeLossGraph(self, distribution): + self._configure_distribution_strategy(distribution) + self._test_minimize_loss_graph(distribution, learning_rate=0.05) + + def testDeviceScope(self, distribution): + """Test the device scope of multi-worker MirroredStrategy.""" + self._configure_distribution_strategy(distribution) + with distribution.scope(): + a = constant_op.constant(1.) + with ops.device("/cpu:0"): + b = constant_op.constant(1.) + self.assertEqual(a.device, "/job:worker/task:0") + self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0") + + def testMakeInputFnIterator(self, distribution): + self._configure_distribution_strategy(distribution) + dataset_fn = lambda: dataset_ops.Dataset.range(100) + num_gpus = context.num_gpus() + num_workers = 2 + + expected_values = [[i+j for j in range(num_gpus)] * num_workers + for i in range(0, 100, num_gpus)] + + with context.graph_mode(), self.cached_session() as sess: + # `expected_input_pipeline_id` is None because the input_fn will be called + # multiple times, each with a different input_pipeline_id. + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_workers*num_gpus, + expected_num_input_pipelines=num_workers, + expected_input_pipeline_id=None) + iterator = distribution.make_input_fn_iterator(input_fn) + self._test_input_fn_iterator( + iterator, distribution.extended.worker_devices, expected_values, sess) class MultiWorkerMirroredStrategyTestWithChief( @@ -1447,6 +1390,12 @@ class MultiWorkerMirroredStrategyTestWithChief( strategy.configure(cluster_spec=self._cluster_spec) self._test_minimize_loss_graph(strategy, learning_rate=0.05) + def testMinimizeLossGraphCoreMirroredStrategy(self): + strategy = mirrored_strategy.CoreMirroredStrategy( + num_gpus_per_worker=context.num_gpus()) + strategy.configure(cluster_spec=self._cluster_spec) + self._test_minimize_loss_graph(strategy, learning_rate=0.05) + def _replica_id(): replica_id = ds_context.get_replica_context().replica_id_in_sync_group diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py deleted file mode 100644 index 0886d0ef341ab18b1fe066d36850e161939c970b..0000000000000000000000000000000000000000 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for class MirroredStrategy.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import strategy_test_lib -from tensorflow.python.eager import context -from tensorflow.python.eager import test -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util -from tensorflow.python.ops import variable_scope -from tensorflow.python.training import distribution_strategy_context as ds_context - - -class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase): - - def _get_distribution_strategy(self): - return mirrored_strategy.MirroredStrategy(["/device:CPU:0"]) - - def testMinimizeLossEager(self): - self._test_minimize_loss_eager(self._get_distribution_strategy()) - - def testMinimizeLossGraph(self): - self._test_minimize_loss_graph(self._get_distribution_strategy()) - - def testReplicaId(self): - self._test_replica_id(self._get_distribution_strategy()) - - @test_util.run_in_graph_and_eager_modes - def testCallAndMergeExceptions(self): - self._test_call_and_merge_exceptions(self._get_distribution_strategy()) - - @test_util.run_in_graph_and_eager_modes - def testInputContextPropertyLocal(self): - d = mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2) - input_fn = self._input_fn_to_test_input_context( - expected_num_replicas_in_sync=2, - expected_num_input_pipelines=1, - expected_input_pipeline_id=0) - d.make_input_fn_iterator(input_fn) - - def testInputContextPropertyMultiWorker(self): - d = mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2) - cluster_spec = {"worker": ["worker1", "worker2", "worker3"]} - d.configure(cluster_spec=cluster_spec) - with context.graph_mode(): - # `expected_input_pipeline_id` is None because the input_fn will be called - # multiple times, each with a different input_pipeline_id. - input_fn = self._input_fn_to_test_input_context( - expected_num_replicas_in_sync=6, - expected_num_input_pipelines=3, - expected_input_pipeline_id=None) - d.make_input_fn_iterator(input_fn) - - -class VariableCreatorStackTest(test.TestCase): - - def testCreatorStacksAreThreadLocal(self): - devices = ["/device:CPU:0", "/device:GPU:0"] - dist = mirrored_strategy.MirroredStrategy(devices) - - def model_fn(): - replica_id_str = str(self.evaluate(_replica_id())) - - def thread_creator_fn(next_creator, *args, **kwargs): - return next_creator(*args, **kwargs) + ":thread_" + replica_id_str - - with variable_scope.variable_creator_scope(thread_creator_fn): - # Create a variable in this scope. - v = variable_scope.variable(1.0) - - # This will pause the current thread, and execute the other thread. - ds_context.get_replica_context().merge_call(lambda _: _) - return v - - def main_thread_creator(next_creator, *args, **kwargs): - # We are not using the underlying next_creator for test purposes. - del next_creator, args, kwargs - return "main_thread" - - with context.graph_mode(), \ - dist.scope(), \ - variable_scope.variable_creator_scope(main_thread_creator): - result = dist.call_for_each_replica(model_fn) - result = dist.unwrap(result) - expected = ["main_thread:thread_0", "main_thread:thread_1"] - self.assertEqual(expected, result) - - -def _replica_id(): - replica_id = ds_context.get_replica_context().replica_id_in_sync_group - if not isinstance(replica_id, ops.Tensor): - replica_id = constant_op.constant(replica_id) - return replica_id - - -class MultiWorkerMirroredStrategyTest(test.TestCase): - - def testDeviceScope(self): - """Test the device scope of multi-worker MirroredStrategy.""" - with context.graph_mode(): - strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus()) - strategy.configure( - cluster_spec={"worker": ["/job:worker/task:0", "/job:worker/task:1"]}) - with strategy.scope(): - a = constant_op.constant(1.) - with ops.device("/cpu:0"): - b = constant_op.constant(1.) - self.assertEqual(a.device, "/job:worker/task:0") - self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0") - - -if __name__ == "__main__": - test.main() diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py index 7ecc852d20508cc7063f3598c9fef03d6ce536a5..c492d8bafc9024ed059f05b92e5466f3702726b9 100644 --- a/tensorflow/contrib/distribute/python/moving_averages_test.py +++ b/tensorflow/contrib/distribute/python/moving_averages_test.py @@ -32,7 +32,8 @@ from tensorflow.python.training import moving_averages all_combinations = combinations.combine( distribution=[combinations.default_strategy, combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu], + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], mode=["graph"]) diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py index 893f073f00e9f0866f1ed23348b145dabc0c40f8..421507232ac26915741d422d8a23008ddb7bf143 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy.py @@ -20,7 +20,7 @@ from __future__ import print_function import six -from tensorflow.contrib.distribute.python import values +from tensorflow.python.distribute import values from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -67,9 +67,8 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended): return next_creator(*args, **kwargs) def _make_dataset_iterator(self, dataset): - distributed_dataset = values.PerReplicaDataset(dataset, [self._device]) - # TODO(priyag): Return distribution strategy specific InputIterator - return distributed_dataset.make_initializable_iterator() + """Make iterator from dataset without splitting the batch.""" + return values.DatasetIterator(dataset, [("/job:localhost", [self._device])]) def _distribute_dataset(self, dataset_fn): return values.PerReplicaDataset( @@ -79,9 +78,9 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended): self, input_fn, replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): - return values.PerReplicaDataset( - self._call_dataset_fn(input_fn, distribute_lib.InputContext()), - [self._device]) + return values.InputFunctionIterator( + input_fn, [("/job:localhost", [self._device])], + [distribute_lib.InputContext()]) def _broadcast_to(self, tensor, destinations): del destinations @@ -101,7 +100,7 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended): fn_inputs = iterator.get_next() if not isinstance(fn_inputs, tuple): fn_inputs = (fn_inputs,) - fn_result = fn(ctx, *fn_inputs) + fn_result = fn(ctx, fn_inputs) flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) with ops.control_dependencies([fn_result]): return [i + 1] + flat_last_step_outputs @@ -195,6 +194,11 @@ class OneDeviceExtended(distribute_lib.DistributionStrategyExtended): def should_save_summary(self): return True + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return True + class _OneDeviceReplicaContext(distribute_lib.ReplicaContext): """ReplicaContext for OneDeviceStrategy.""" @@ -205,10 +209,6 @@ class _OneDeviceReplicaContext(distribute_lib.ReplicaContext): distribution_strategy, replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)) - @property - def device(self): - raise RuntimeError("Use .devices instead") - @property def devices(self): - return [self._distribution_strategy.worker_devices[0]] + return [self._distribution_strategy.extended.worker_devices[0]] diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py index cf6d890390c39a4672f5bc1a89ffa97512ff5b4a..d46cd6f529e363f76bfa2b22339add63530cfde8 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.contrib.distribute.python import one_device_strategy from tensorflow.contrib.distribute.python import strategy_test_lib +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import test from tensorflow.python.framework import test_util @@ -43,13 +44,18 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase): self._test_call_and_merge_exceptions(self._get_distribution_strategy()) @test_util.run_in_graph_and_eager_modes - def testInputContextPropertyLocal(self): + def testMakeInputFnIterator(self): d = one_device_strategy.OneDeviceStrategy("/device:CPU:0") + dataset_fn = lambda: dataset_ops.Dataset.range(10) + expected_values = [[i] for i in range(10)] input_fn = self._input_fn_to_test_input_context( + dataset_fn, expected_num_replicas_in_sync=1, expected_num_input_pipelines=1, expected_input_pipeline_id=0) - d.make_input_fn_iterator(input_fn) + iterator = d.make_input_fn_iterator(input_fn) + self._test_input_fn_iterator( + iterator, d.extended.worker_devices, expected_values) if __name__ == "__main__": diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py index 58b6c8717919a5e8d1aa5a949b88e05abdc4b040..fc2d2b20c95f0260d8243b662a020ddee8a00b14 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py @@ -18,11 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import values +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import ops @@ -108,8 +107,8 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): self._initialize_local(num_gpus_per_worker) # We typically don't need to do all-reduce in this strategy. - self._cross_tower_ops = ( - cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps( + self._cross_device_ops = ( + cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps( reduce_to_device=_LOCAL_CPU)) def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec, @@ -198,6 +197,7 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): def _initialize_local(self, num_gpus_per_worker): """Initialize internal devices for local training.""" + self._worker_device = "/job:localhost" # Define compute devices which is a list of device strings and one for each # replica. When there are GPUs, replicate operations on these GPUs. # Otherwise, place operations on CPU. @@ -252,14 +252,21 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): num_input_pipelines=num_input_pipelines, input_pipeline_id=input_pipeline_id, num_replicas_in_sync=self._num_replicas_in_sync) - return values.PerReplicaDataset( - self._call_dataset_fn(input_fn, input_context), self._compute_devices, - True) + worker_device_pairs = [(self._worker_device, self._compute_devices)] + return values.InputFunctionIterator( + input_fn, worker_device_pairs, [input_context]) def _broadcast_to(self, tensor, destinations): - if not cross_tower_ops_lib.check_destinations(destinations): + # This is both a fast path for Python constants, and a way to delay + # converting Python values to a tensor until we know what type it + # should be converted to. Otherwise we have trouble with: + # global_step.assign_add(1) + # since the `1` gets broadcast as an int32 but global_step is int64. + if isinstance(tensor, (float, int)): + return tensor + if not cross_device_ops_lib.check_destinations(destinations): destinations = self._compute_devices - return self._cross_tower_ops.broadcast(tensor, destinations) + return self._cross_device_ops.broadcast(tensor, destinations) def _allow_variable_partition(self): return not context.executing_eagerly() @@ -331,7 +338,7 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): return if destinations is None: return - for d in cross_tower_ops_lib.get_devices_from(destinations): + for d in cross_device_ops_lib.get_devices_from(destinations): d_spec = tf_device.DeviceSpec.from_string(d) if d_spec.job == self._task_type and d_spec.task != self._task_id: raise ValueError( @@ -344,20 +351,14 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): # pylint: disable=protected-access return mirrored_strategy._reduce_non_distributed_value( self, reduce_op, value, destinations) - if reduce_op == reduce_util.ReduceOp.ONLY_FIRST_REPLICA: - return self.broadcast_to( - value.get(self._compute_devices[0]), destinations) - return self._cross_tower_ops.reduce( + return self._cross_device_ops.reduce( reduce_op, value, destinations=destinations) def _batch_reduce_to(self, reduce_op, value_destination_pairs): - if reduce_op == reduce_util.ReduceOp.ONLY_FIRST_REPLICA: - return [self.broadcast_to(v.get(self._compute_devices[0]), d) - for v, d in value_destination_pairs] for _, destinations in value_destination_pairs: self._verify_destinations_not_different_worker(destinations) - return self._cross_tower_ops.batch_reduce(reduce_op, - value_destination_pairs) + return self._cross_device_ops.batch_reduce(reduce_op, + value_destination_pairs) def _select_single_value(self, structured): """Select any single values in `structured`.""" @@ -509,3 +510,8 @@ class ParameterServerExtended(distribute_lib.DistributionStrategyExtended): @property def should_save_summary(self): return self._is_chief + + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return False diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index 091a358fea0678e72647c6f628ce9a4761dc2655..1ada6a6ba493563cd56342854f8d84a8ed5a7d40 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -26,14 +26,16 @@ from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import parameter_server_strategy from tensorflow.contrib.distribute.python import strategy_test_lib -from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.estimator import run_config from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.layers import core @@ -516,8 +518,40 @@ class ParameterServerStrategyTestBase( self.assertLess(error_after, error_before) return error_after < error_before + def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn, + expected_values): + distribution, master_target, config = self._get_test_objects( + task_type, task_id, num_gpus) + devices = distribution.extended.worker_devices + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + iterator = distribution.make_input_fn_iterator(input_fn) + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + sess.run([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + class ParameterServerStrategyTest(ParameterServerStrategyTestBase, + strategy_test_lib.DistributionTestBase, parameterized.TestCase): @classmethod @@ -582,6 +616,46 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase, def testMinimizeLossGraphLocal(self, num_gpus): self._test_minimize_loss_graph(None, None, num_gpus) + # TODO(priyag): Refactor this and other multi worker tests. + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1)) + def testMakeInputFnIteratorDistributed(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(100) + expected_values = [[i+j for j in range(num_gpus)] + for i in range(0, 100, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=3, + expected_input_pipeline_id=1) # because task_id = 1 + self._test_input_fn_iterator('worker', 1, num_gpus, + input_fn, expected_values) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1)) + def testMakeInputFnIteratorLocal(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + dataset_fn = lambda: dataset_ops.Dataset.range(100) + expected_values = [[i+j for j in range(num_gpus)] + for i in range(0, 100, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + dataset_fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) # only one worker and pipeline for local. + self._test_input_fn_iterator(None, None, num_gpus, + input_fn, expected_values) + + def testGlobalStepUpdate(self): + strategy = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=context.num_gpus()) + self._test_global_step_update(strategy) + class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, parameterized.TestCase): @@ -624,32 +698,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, v = variable_scope.get_variable('v', initializer=10.0) _ = v * v v, = tape.watched_variables() - w = distribution.value_container(v) + w = distribution.extended.value_container(v) self.assertIs(values.AggregatingVariable, type(w)) - distribution.call_for_each_replica(f) - - -class InputContextTest(strategy_test_lib.DistributionTestBase): - - def testInputContextPropertyLocal(self): - d = parameter_server_strategy.ParameterServerStrategy(num_gpus_per_worker=2) - with context.graph_mode(): - input_fn = self._input_fn_to_test_input_context( - expected_num_replicas_in_sync=2, - expected_num_input_pipelines=1, - expected_input_pipeline_id=0) - d.make_input_fn_iterator(input_fn) - - def testInputContextPropertyMultiWorker(self): - d = parameter_server_strategy.ParameterServerStrategy(num_gpus_per_worker=2) - cluster_spec = {'worker': ['worker1', 'worker2', 'worker3'], 'ps': ['ps1']} - d.configure(cluster_spec=cluster_spec, task_type='worker', task_id=1) - with context.graph_mode(): - input_fn = self._input_fn_to_test_input_context( - expected_num_replicas_in_sync=2, - expected_num_input_pipelines=3, - expected_input_pipeline_id=1) # because task_id =1 - d.make_input_fn_iterator(input_fn) + distribution.extended.call_for_each_replica(f) if __name__ == '__main__': diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py index 3dc815f0371002bd3a8657f18ccc09a27bb14961..c928b6d9f1f21508edd753f94c38ab2723cc0a9f 100644 --- a/tensorflow/contrib/distribute/python/step_fn.py +++ b/tensorflow/contrib/distribute/python/step_fn.py @@ -94,7 +94,7 @@ class StandardSingleLossStep(StandardInputStep): def __call__(self): with self._distribution.scope(): - def step_fn(ctx, *inputs): + def step_fn(ctx, inputs): """Function to run one iteration with one input.""" gradients_fn = backprop.implicit_grad(self._loss_fn) gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn) diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py index 7c6f7123da15217f9ef725bf2792696f13ac5756..5a8e8ed0dda0b99e759edbe916a46dab953929a0 100644 --- a/tensorflow/contrib/distribute/python/strategy_test_lib.py +++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py @@ -19,15 +19,19 @@ from __future__ import division from __future__ import print_function from tensorflow.core.protobuf import config_pb2 -from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.layers import core from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.training import distribution_strategy_context as ds_context from tensorflow.python.training import optimizer @@ -187,16 +191,18 @@ class DistributionTestBase(test.TestCase): def _test_replica_id(self, d): with d.scope(): - expected_devices = [False] * len(d.worker_devices) + expected_devices = [False] * len(d.extended.worker_devices) def mark_devices_fn(): - replica_id = ds_context.get_replica_context().replica_id_in_sync_group - self.assertLess(replica_id, len(d.worker_devices)) + replica_id = self.evaluate( + ds_context.get_replica_context().replica_id_in_sync_group) + self.assertLess(replica_id, len(d.extended.worker_devices)) self.assertFalse(expected_devices[replica_id]) expected_devices[replica_id] = True d.call_for_each_replica(mark_devices_fn) - self.assertAllEqual(expected_devices, [True] * len(d.worker_devices)) + self.assertAllEqual(expected_devices, + [True] * len(d.extended.worker_devices)) def _test_call_and_merge_exceptions(self, dist): with dist.scope(): @@ -209,7 +215,9 @@ class DistributionTestBase(test.TestCase): with self.assertRaises(_TestException): dist.call_for_each_replica(_merge_call_merge_raises_fn) - def _input_fn_to_test_input_context(self, expected_num_replicas_in_sync, + def _input_fn_to_test_input_context(self, + dataset_fn, + expected_num_replicas_in_sync, expected_num_input_pipelines, expected_input_pipeline_id): # Use a list of one element as counter so that it can be captured by the @@ -232,6 +240,52 @@ class DistributionTestBase(test.TestCase): self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id) worker_id_counter[0] += 1 - return dataset_ops.Dataset.from_tensors([[1.]]).repeat() + return dataset_fn() return _input_fn + + def _test_input_fn_iterator(self, iterator, devices, expected_values, + sess=None): + evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) + evaluate(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + evaluate([values.select_device(d, next_element) for d in devices]) + + # After re-initializing the iterator, should be able to iterate again. + evaluate(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, computed_value) + + def _test_global_step_update(self, strategy): + with strategy.scope(): + global_step = variable_scope.get_variable( + "global_step", + shape=[], + dtype=dtypes.int64, + initializer=init_ops.zeros_initializer(), + trainable=False, + aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA) + self.evaluate(variables.global_variables_initializer()) + + def model_fn(): + train_op = global_step.assign_add(1) + value = global_step.read_value() + return train_op, value + + train_ops, value = strategy.call_for_each_replica(model_fn) + self.evaluate(strategy.group(train_ops)) + global_step_tensors = strategy.unwrap(value) + global_step_values = self.evaluate(global_step_tensors) + self.assertEqual([1] * len(global_step_tensors), global_step_values) diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index 6e42bf9d972fa41ef8ccf4b269f4e347914f3378..94cf548cb44e4137d5b62320b43afda19819e2f7 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -23,15 +23,13 @@ from __future__ import print_function import functools -from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib -from tensorflow.contrib.distribute.python import values from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib from tensorflow.contrib.tpu.python.tpu import training_loop -from tensorflow.python.data.experimental.ops import batching -from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.eager import tape from tensorflow.python.framework import constant_op @@ -232,59 +230,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended): return enqueue_op_per_host def _make_dataset_iterator(self, dataset): - """Make iterators for each of the TPU hosts. - - We first unbatch the users input dataset and then rebatch it with the - per replica batch size that is calculated using - `global_batch_size // num_replicas_in_sync`. The currently supported cases - are as follows: - `dataset.batch()` is the last operation on the dataset. - `dataset.apply(map_and_batch)` is the last operation on the dataset. - `dataset.batch().prefetch()` are the last 2 operations on the dataset. - `dataset.apply(map_and_batch).prefetch()` are the last 2 operations. - - Args: - dataset: The `tf.data` dataset passed by the user. - - Returns: - iterator: InputIterator created for each of the host machines. - """ - # TODO(sourabhbajaj): Remove this in lieu of distributed datasets - def _get_dataset_batch_size(dataset): - """Get the global batch size from the dataset object.""" - # pylint: disable=protected-access - if isinstance(dataset, dataset_ops.DatasetV1Adapter): - dataset = dataset._dataset - if isinstance(dataset, dataset_ops.BatchDataset): - return tensor_util.constant_value(dataset._batch_size) - elif isinstance(dataset, batching._MapAndBatchDataset): - return dataset._batch_size - elif isinstance(dataset, dataset_ops.PrefetchDataset): - return _get_dataset_batch_size(dataset._input_dataset) - # pylint: enable=protected-access - raise ValueError( - "Unable to fetch the batch size from the input dataset. `batch` " - "`map_and_batch` need to be the last operations on the dataset. " - "The batch operations can be followed by a prefetch.") - - global_batch_size = _get_dataset_batch_size(dataset) - if global_batch_size % self._num_replicas_in_sync: - raise ValueError( - "Batch size %s cannot be sharded evenly across replicas %s" % ( - global_batch_size, self.num_replicas_in_sync)) - per_replica_batch_size = global_batch_size // self._num_replicas_in_sync - dataset = dataset.apply(batching.unbatch()) - dataset = dataset.batch(per_replica_batch_size, drop_remainder=True) + """Make iterators for each of the TPU hosts.""" worker_devices = [ (self.get_host(hid), [self.get_host_cpu_device(hid)]) for hid in range(self.num_hosts) ] - distributed_dataset = values.MultiWorkerDataset( - functools.partial(self._call_dataset_fn, lambda: dataset), - worker_devices) - # TODO(priyag): Return distribution strategy specific InputIterator - return distributed_dataset.make_initializable_iterator() + return values.DatasetIterator(dataset, worker_devices, + self._num_replicas_in_sync) def _distribute_dataset(self, dataset_fn): worker_devices = [ @@ -301,7 +254,7 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended): self, fn, multi_worker_iterator, iterations, initial_loop_values=None): output_shapes = multi_worker_iterator.output_shapes shapes = nest.flatten(output_shapes) - if any([not s.is_fully_defined() for s in shapes]): + if any(not s.is_fully_defined() for s in shapes): raise ValueError( "TPU currently requires fully defined shapes. Either use " "set_shape() on the input tensors or use " @@ -328,7 +281,7 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended): fn_inputs = dequeue_fn() if not isinstance(fn_inputs, tuple): fn_inputs = (fn_inputs,) - fn_result = fn(ctx, *fn_inputs) + fn_result = fn(ctx, fn_inputs) flat_last_step_outputs = nest.flatten(ctx.last_step_outputs) if flat_last_step_outputs: with ops.control_dependencies([fn_result]): @@ -467,15 +420,13 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended): # Validate that the destination is same as the host device # Note we don't do this when in replicate context as the reduction is # performed on the TPU device itself. - devices = cross_tower_ops_lib.get_devices_from(destinations) + devices = cross_device_ops_lib.get_devices_from(destinations) if len(devices) == 1: assert device_util.canonicalize(devices[0]) == device_util.canonicalize( self._host_device) else: raise ValueError("Multiple devices are not supported for TPUStrategy") - if reduce_op == reduce_util.ReduceOp.ONLY_FIRST_REPLICA: - return value[0] output = math_ops.add_n(value) if reduce_op == reduce_util.ReduceOp.MEAN: return output * (1. / len(value)) @@ -593,6 +544,11 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended): if cluster_spec: session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def()) + # TODO(priyag): Delete this once all strategies use global batch size. + @property + def _global_batch_size(self): + return True + class _TPUReplicaContext(distribute_lib.ReplicaContext): """Replication Context class for TPU Strategy.""" @@ -605,13 +561,9 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext): # TODO(b/118385803): properly initialize replica_id, instead of always 0 replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)) - @property - def device(self): - raise RuntimeError("Use .devices instead") - @property def devices(self): distribute_lib.require_replica_context(self) ds = self._distribution_strategy replica_id = tensor_util.constant_value(self._replica_id_in_sync_group) - return [ds.worker_devices[replica_id]] + return [ds.extended.worker_devices[replica_id]] diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py index b4b56eb4e4dbc39c2210fbb27237386dd46d0244..855b9c29aec0c0a65f1a715eea764067a41ba2f3 100644 --- a/tensorflow/contrib/distribute/python/values_test.py +++ b/tensorflow/contrib/distribute/python/values_test.py @@ -19,12 +19,13 @@ from __future__ import division from __future__ import print_function import os +from absl.testing import parameterized -from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.contrib.distribute.python import combinations from tensorflow.contrib.distribute.python import multi_worker_test_base -from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.estimator import model_fn as model_fn_lib @@ -34,10 +35,12 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training import device_util +from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.training import saver as saver_lib from tensorflow.python.util import nest @@ -324,20 +327,20 @@ class RegroupAndSelectDeviceTest(test.TestCase): self.assertTrue( isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec)) - self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode) + self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode) for device_id in range(3): d = _device_str(device_id) - self.assertEquals(created_estimator_specs[device_id].loss, - merged_estimator_spec.loss.get(d)) - self.assertEquals(created_estimator_specs[device_id].train_op, - merged_estimator_spec.train_op.get(d)) + self.assertEqual(created_estimator_specs[device_id].loss, + merged_estimator_spec.loss.get(d)) + self.assertEqual(created_estimator_specs[device_id].train_op, + merged_estimator_spec.train_op.get(d)) # Scaffold is populated by `EstimatorSpec.__new__`. - self.assertEquals(created_estimator_specs[device_id].scaffold, - merged_estimator_spec.scaffold.get(d)) + self.assertEqual(created_estimator_specs[device_id].scaffold, + merged_estimator_spec.scaffold.get(d)) # Also test that we can undo the merge using select_device() - self.assertEquals(created_estimator_specs[device_id], - values.select_device(_device_str(device_id), - merged_estimator_spec)) + self.assertEqual(created_estimator_specs[device_id], + values.select_device(_device_str(device_id), + merged_estimator_spec)) class PerReplicaDatasetTest(test.TestCase): @@ -568,90 +571,126 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase): multi_worker_iterator.get_next() -class InputFunctionIteratorTestBase(test.TestCase): +class InputIteratorTestBase(test.TestCase): - def _test_iterator(self, input_fn, worker_device_pairs, expected_values, - sess=None): + def _test_iterator(self, input_type, dataset_fn, worker_device_pairs, + expected_values, sess=None, split_batch_by=None): devices = nest.flatten([ds for _, ds in worker_device_pairs]) - iterator = values.InputFunctionIterator(input_fn, worker_device_pairs) + + if input_type == "input_fn": + input_contexts = [ + distribute_lib.InputContext() for _ in worker_device_pairs] + input_fn = lambda _: dataset_fn() + iterator = values.InputFunctionIterator(input_fn, worker_device_pairs, + input_contexts) + else: + iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs, + split_batch_by) evaluate = lambda x: sess.run(x) if sess else self.evaluate(x) - evaluate(iterator.initialize()) + evaluate(control_flow_ops.group(iterator.initialize())) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) - self.assertEqual(expected_value, computed_value) + self.assertAllEqual(expected_value, computed_value) with self.assertRaises(errors.OutOfRangeError): next_element = iterator.get_next() evaluate([values.select_device(d, next_element) for d in devices]) # After re-initializing the iterator, should be able to iterate again. - evaluate(iterator.initialize()) + evaluate(control_flow_ops.group(iterator.initialize())) for expected_value in expected_values: next_element = iterator.get_next() computed_value = evaluate( [values.select_device(d, next_element) for d in devices]) - self.assertEqual(expected_value, computed_value) + self.assertAllEqual(expected_value, computed_value) -class InputFunctionIteratorSingleWorkerTest(InputFunctionIteratorTestBase): +class InputIteratorSingleWorkerTest(InputIteratorTestBase, + parameterized.TestCase): - @test_util.run_in_graph_and_eager_modes - def testOneDeviceCPU(self): + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"])) + def testOneDeviceCPU(self, input_type): worker_device_pairs = [("", ["/device:CPU:0"])] - input_fn = lambda: dataset_ops.Dataset.range(10) + dataset_fn = lambda: dataset_ops.Dataset.range(10) expected_values = [[i] for i in range(10)] - self._test_iterator(input_fn, worker_device_pairs, expected_values) - - @test_util.run_in_graph_and_eager_modes() - def testTwoDevicesOneGPUOneCPU(self): - if context.num_gpus() < 1: - self.skipTest("A GPU is not available for this test.") + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testTwoDevicesOneGPUOneCPU(self, input_type): worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] - input_fn = lambda: dataset_ops.Dataset.range(10) + dataset_fn = lambda: dataset_ops.Dataset.range(10) expected_values = [[i, i+1] for i in range(0, 10, 2)] - self._test_iterator(input_fn, worker_device_pairs, expected_values) - - @test_util.run_in_graph_and_eager_modes() - def testTupleDataset(self): - if context.num_gpus() < 1: - self.skipTest("A GPU is not available for this test.") + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testTupleDataset(self, input_type): worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] - def input_fn(): + def dataset_fn(): dataset1 = dataset_ops.Dataset.range(10) dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2) return dataset_ops.Dataset.zip((dataset1, dataset2)) expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)] - self._test_iterator(input_fn, worker_device_pairs, expected_values) - - @test_util.run_in_graph_and_eager_modes() - def testUnevenDatasetBatches(self): - if context.num_gpus() < 1: - self.skipTest("A GPU is not available for this test.") + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testUnevenDatasetBatches(self, input_type): worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] - input_fn = lambda: dataset_ops.Dataset.range(11) + dataset_fn = lambda: dataset_ops.Dataset.range(11) expected_values = [[i, i+1] for i in range(0, 10, 2)] - self._test_iterator(input_fn, worker_device_pairs, expected_values) + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + input_type=["dataset"], + split_batch_by=[None, 2], + required_gpus=1)) + def testBatchSplitting(self, input_type, split_batch_by): + worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])] + batch_size = 10 + dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size) + + updated_batch_size = ( + batch_size // split_batch_by if split_batch_by else batch_size) + expected_values = [[range(i, i+updated_batch_size), + range(i+updated_batch_size, i+2*updated_batch_size)] + for i in range(0, 100, updated_batch_size*2)] + self._test_iterator(input_type, dataset_fn, worker_device_pairs, + expected_values, sess=None, + split_batch_by=split_batch_by) -class InputFunctionIteratorMultiWorkerTest( - multi_worker_test_base.MultiWorkerTestBase, - InputFunctionIteratorTestBase): + +class InputIteratorMultiWorkerTest( + multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase, + parameterized.TestCase): def _cpu_devices(self): return [ @@ -672,35 +711,44 @@ class InputFunctionIteratorMultiWorkerTest( ]) ] - def testOneDevicePerWorker(self): + @combinations.generate(combinations.combine( + mode=["graph"], + input_type=["input_fn", "dataset"])) + def testOneDevicePerWorker(self, input_type): worker_devices = self._cpu_devices() with context.graph_mode(), self.cached_session() as sess: - input_fn = lambda: dataset_ops.Dataset.range(4) - self._test_iterator(input_fn, worker_devices, + dataset_fn = lambda: dataset_ops.Dataset.range(4) + self._test_iterator(input_type, dataset_fn, worker_devices, [[0, 0], [1, 1], [2, 2], [3, 3]], sess) - def testTwoDevicesPerWorker(self): - if context.num_gpus() < 1: - self.skipTest("A GPU is not available for this test.") + @combinations.generate(combinations.combine( + mode=["graph"], + input_type=["input_fn", "dataset"], + required_gpus=1)) + def testTwoDevicesPerWorker(self, input_type): worker_devices = self._cpu_and_one_gpu_devices() with context.graph_mode(), self.cached_session() as sess: - input_fn = lambda: dataset_ops.Dataset.range(4) - self._test_iterator(input_fn, worker_devices, + dataset_fn = lambda: dataset_ops.Dataset.range(4) + self._test_iterator(input_type, dataset_fn, worker_devices, [[0, 1, 0, 1], [2, 3, 2, 3]], sess) - def testTupleDataset(self): + @combinations.generate(combinations.combine( + mode=["graph"], + input_type=["input_fn", "dataset"])) + def testTupleDataset(self, input_type): worker_devices = self._cpu_devices() with context.graph_mode(), self.cached_session() as sess: - def input_fn(): + def dataset_fn(): dataset1 = dataset_ops.Dataset.range(4) dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2) return dataset_ops.Dataset.zip((dataset1, dataset2)) expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)] - self._test_iterator(input_fn, worker_devices, expected_values, sess) + self._test_iterator(input_type, dataset_fn, worker_devices, + expected_values, sess) -class MirroredVariableTest(test.TestCase): +class MirroredVariableTest(test.TestCase, parameterized.TestCase): config = config_pb2.ConfigProto() config.allow_soft_placement = True @@ -712,9 +760,9 @@ class MirroredVariableTest(test.TestCase): v, _, mirrored = _make_mirrored() - self.assertEquals(v[0].name, mirrored.name) - self.assertEquals(v[0].dtype, mirrored.dtype) - self.assertEquals(v[0].shape, mirrored.shape) + self.assertEqual(v[0].name, mirrored.name) + self.assertEqual(v[0].dtype, mirrored.dtype) + self.assertEqual(v[0].shape, mirrored.shape) @test_util.run_in_graph_and_eager_modes(config=config) def testVariableOnAnotherDevice(self): @@ -724,9 +772,9 @@ class MirroredVariableTest(test.TestCase): mirrored = values.MirroredVariable(index, v, variable_scope.VariableAggregation.MEAN) - self.assertEquals(v.name, mirrored.name) - self.assertEquals(v.dtype, mirrored.dtype) - self.assertEquals(v.shape, mirrored.shape) + self.assertEqual(v.name, mirrored.name) + self.assertEqual(v.dtype, mirrored.dtype) + self.assertEqual(v.shape, mirrored.shape) def _assign_mirrored(self, devices, v, new): for d, var, n in zip(devices, v, new): @@ -846,14 +894,13 @@ class MirroredVariableTest(test.TestCase): save_path = self._save_normal() self._restore_mirrored(save_path) - @test_util.run_in_graph_and_eager_modes(config=config) - def testFetchAMirroredVariable(self): - if context.num_gpus() < 1 or context.executing_eagerly(): - self.skipTest("A GPU is not available for this test or it's eager mode.") - - with self.session( - graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy( - ["/device:GPU:0"]).scope(): + @combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_one_gpu, + combinations.core_mirrored_strategy_with_one_gpu], + mode=["graph"])) + def testFetchAMirroredVariable(self, distribution): + with self.session(graph=ops.Graph()) as sess, distribution.scope(): with ops.device("/device:GPU:0"): v = variable_scope.get_variable( name="v", initializer=1., use_resource=True) @@ -879,7 +926,7 @@ def _make_replica_local(method): return v, replica_local -class ReplicaLocalVariableTest(test.TestCase): +class ReplicaLocalVariablePropertiesTest(test.TestCase): config = config_pb2.ConfigProto() config.allow_soft_placement = True @@ -888,15 +935,14 @@ class ReplicaLocalVariableTest(test.TestCase): def testProperties(self): if context.num_gpus() < 1 and context.executing_eagerly(): self.skipTest("A GPU is not available for this test in eager mode.") - v, replica_local = _make_replica_local( variable_scope.VariableAggregation.SUM) - self.assertEquals(v[0].name, replica_local.name) - self.assertEquals(v[0].dtype, replica_local.dtype) - self.assertEquals(v[0].shape, replica_local.shape) - self.assertEquals(variable_scope.VariableAggregation.SUM, - replica_local.aggregation) + self.assertEqual(v[0].name, replica_local.name) + self.assertEqual(v[0].dtype, replica_local.dtype) + self.assertEqual(v[0].shape, replica_local.shape) + self.assertEqual(variable_scope.VariableAggregation.SUM, + replica_local.aggregation) @test_util.run_in_graph_and_eager_modes(config=config) def testVariableOnAnotherDevice(self): @@ -906,11 +952,32 @@ class ReplicaLocalVariableTest(test.TestCase): replica_local = values.ReplicaLocalVariable( index, v, variable_scope.VariableAggregation.MEAN) - self.assertEquals(v.name, replica_local.name) - self.assertEquals(v.dtype, replica_local.dtype) - self.assertEquals(v.shape, replica_local.shape) - self.assertEquals(variable_scope.VariableAggregation.MEAN, - replica_local.aggregation) + self.assertEqual(v.name, replica_local.name) + self.assertEqual(v.dtype, replica_local.dtype) + self.assertEqual(v.shape, replica_local.shape) + self.assertEqual(variable_scope.VariableAggregation.MEAN, + replica_local.aggregation) + + def testTensorConversion(self): + with context.graph_mode(): + _, replica_local = _make_replica_local( + variable_scope.VariableAggregation.SUM) + converted = ops.internal_convert_to_tensor(replica_local, as_ref=False) + self.assertIsInstance(converted, ops.Tensor) + self.assertEqual(converted.dtype, replica_local.dtype) + + converted = ops.internal_convert_to_tensor(replica_local, as_ref=True) + # Resources variable are converted to tensors as well when as_ref is True. + self.assertIsInstance(converted, ops.Tensor) + self.assertEqual(converted.dtype, replica_local.dtype) + + +@combinations.generate(combinations.combine( + distribution=[ + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_gpu_and_cpu], + mode=["graph", "eager"])) +class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase): def _assign_replica_local(self, devices, v, new): for d, var, n in zip(devices, v, new): @@ -927,22 +994,15 @@ class ReplicaLocalVariableTest(test.TestCase): save_path, _ = self._save_return_saver(sess, var) return save_path - def _dist_scope(self): - return mirrored_strategy.MirroredStrategy(_devices).scope() - - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveAndRestoreReplicaLocalSumOneGraph(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - with self.cached_session(config=self.config) as sess: + def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution): + with self.cached_session() as sess: v, replica_local = _make_replica_local( variable_scope.VariableAggregation.SUM) # Overwrite the initial values. self._assign_replica_local(_devices, v, [3., 4.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of v[0] + v[1], 7. save_path, saver = self._save_return_saver(sess, replica_local) @@ -954,19 +1014,18 @@ class ReplicaLocalVariableTest(test.TestCase): saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveAndRestoreReplicaLocalMeanOneGraph(self): + def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution): if context.num_gpus() < 1 and context.executing_eagerly(): self.skipTest("A GPU is not available for this test in eager mode.") - with self.cached_session(config=self.config) as sess: + with self.cached_session() as sess: v, replica_local = _make_replica_local( variable_scope.VariableAggregation.MEAN) # Overwrite the initial values. self._assign_replica_local(_devices, v, [3., 4.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of (v[0] + v[1])/2, 3.5. save_path, saver = self._save_return_saver(sess, replica_local) @@ -977,7 +1036,7 @@ class ReplicaLocalVariableTest(test.TestCase): saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) - def _save_replica_local_mean(self): + def _save_replica_local_mean(self, distribution): """Save variables with mirroring, returns save_path.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local( @@ -986,7 +1045,7 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [3., 4.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of (v[0] + v[1])/2, 3.5 save_path = self._save(sess, replica_local) @@ -994,7 +1053,7 @@ class ReplicaLocalVariableTest(test.TestCase): self._assign_replica_local(_devices, v, [5., 6.]) return save_path - def _save_replica_local_sum(self): + def _save_replica_local_sum(self, distribution): """Save variables with mirroring, returns save_path.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local("sum") @@ -1002,7 +1061,7 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [1.5, 2.]) - with self._dist_scope(): + with distribution.scope(): # Saves the current value of v[0] + v[1], 3.5 save_path = self._save(sess, replica_local) @@ -1040,7 +1099,7 @@ class ReplicaLocalVariableTest(test.TestCase): saver.restore(sess, save_path) self.assertEqual(3.5, self.evaluate(var)) - def _restore_replica_local_mean(self, save_path): + def _restore_replica_local_mean(self, save_path, distribution): """Restore to variables with mirroring in a fresh graph.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local( @@ -1049,13 +1108,13 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [7., 8.]) - with self._dist_scope(): + with distribution.scope(): # Restores the saved value of 3.5 to both variables. saver = saver_lib.Saver(var_list=[replica_local]) saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) - def _restore_replica_local_sum(self, save_path): + def _restore_replica_local_sum(self, save_path, distribution): """Restore to variables with mirroring in a fresh graph.""" with self.session(graph=ops.Graph()) as sess: v, replica_local = _make_replica_local( @@ -1064,72 +1123,35 @@ class ReplicaLocalVariableTest(test.TestCase): # Overwrite the initial values. self._assign_replica_local(_devices, v, [7., 8.]) - with self._dist_scope(): + with distribution.scope(): # Restores the saved value of 3.5 to both variables. saver = saver_lib.Saver(var_list=[replica_local]) saver.restore(sess, save_path) self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]])) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalRestoreReplicaLocalMean(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") + def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution): + save_path = self._save_replica_local_mean(distribution) + self._restore_replica_local_mean(save_path, distribution) - save_path = self._save_replica_local_mean() - self._restore_replica_local_mean(save_path) + def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution): + save_path = self._save_replica_local_sum(distribution) + self._restore_replica_local_sum(save_path, distribution) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalRestoreReplicaLocalSum(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - save_path = self._save_replica_local_sum() - self._restore_replica_local_sum(save_path) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalMeanRestoreNormal(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - save_path = self._save_replica_local_mean() + def testSaveReplicaLocalMeanRestoreNormal(self, distribution): + save_path = self._save_replica_local_mean(distribution) self._restore_normal(save_path) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveReplicaLocalSumRestoreNormal(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - - save_path = self._save_replica_local_sum() + def testSaveReplicaLocalSumRestoreNormal(self, distribution): + save_path = self._save_replica_local_sum(distribution) self._restore_normal(save_path) - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveNormalRestoreReplicaLocalMean(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") - + def testSaveNormalRestoreReplicaLocalMean(self, distribution): save_path = self._save_normal() - self._restore_replica_local_mean(save_path) - - @test_util.run_in_graph_and_eager_modes(config=config) - def testSaveNormalRestoreReplicaLocalSum(self): - if context.num_gpus() < 1 and context.executing_eagerly(): - self.skipTest("A GPU is not available for this test in eager mode.") + self._restore_replica_local_mean(save_path, distribution) + def testSaveNormalRestoreReplicaLocalSum(self, distribution): save_path = self._save_normal() - self._restore_replica_local_sum(save_path) - - def testTensorConversion(self): - with context.graph_mode(): - _, replica_local = _make_replica_local( - variable_scope.VariableAggregation.SUM) - converted = ops.internal_convert_to_tensor(replica_local, as_ref=False) - self.assertIsInstance(converted, ops.Tensor) - self.assertEqual(converted.dtype, replica_local.dtype) - - converted = ops.internal_convert_to_tensor(replica_local, as_ref=True) - # Resources variable are converted to tensors as well when as_ref is True. - self.assertIsInstance(converted, ops.Tensor) - self.assertEqual(converted.dtype, replica_local.dtype) + self._restore_replica_local_sum(save_path, distribution) if __name__ == "__main__": diff --git a/tensorflow/contrib/distribute/python/warm_starting_util_test.py b/tensorflow/contrib/distribute/python/warm_starting_util_test.py index 5d57d144c1c16a08280970ecd89eb54f7cf1ffd4..b0bcf9b17456c938204a4892451928daf90b6743 100644 --- a/tensorflow/contrib/distribute/python/warm_starting_util_test.py +++ b/tensorflow/contrib/distribute/python/warm_starting_util_test.py @@ -44,7 +44,9 @@ class WarmStartingUtilWithDistributionStrategyTest( distribution=[combinations.default_strategy, combinations.one_device_strategy, combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], + combinations.mirrored_strategy_with_two_gpus, + combinations.core_mirrored_strategy_with_gpu_and_cpu, + combinations.core_mirrored_strategy_with_two_gpus], save_with_distribution=[True, False], restore_with_distribution=[True, False], mode=["graph"])) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py index 29eeaf43c5185ce5519d4a1211f66e99ce61c6ab..ab3c07172a68255f4e387e071ac2f8341e93b90c 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py @@ -82,7 +82,7 @@ class NormalTest(test.TestCase): x = constant_op.constant( [[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], [2.5, -2.5, -4.0, 0.0, 1.0, -2.0]], dtype=dtypes.float32) - s = math_ops.reduce_sum(x, reduction_indices=[1]) + s = math_ops.reduce_sum(x, axis=[1]) x = array_ops.transpose(x) # Reshape to shape (6, 2) n = constant_op.constant([6] * 2) prior = distributions.Normal(loc=mu0, scale=sigma0) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py index a60056c444a3fe7262939c5b3c73673f9a7c1469..cdee30bbc42e661952a9c757d7a30ebcd393f794 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py @@ -147,14 +147,13 @@ class WishartCholeskyTest(test.TestCase): x = chol_w.sample(10000, seed=42) self.assertAllEqual((10000, 3, 3), x.get_shape()) - moment1_estimate = math_ops.reduce_mean(x, reduction_indices=[0]).eval() + moment1_estimate = math_ops.reduce_mean(x, axis=[0]).eval() self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05) # The Variance estimate uses the squares rather than outer-products # because Wishart.Variance is the diagonal of the Wishart covariance # matrix. - variance_estimate = (math_ops.reduce_mean( - math_ops.square(x), reduction_indices=[0]) - + variance_estimate = (math_ops.reduce_mean(math_ops.square(x), axis=[0]) - math_ops.square(moment1_estimate)).eval() self.assertAllClose( chol_w.variance().eval(), variance_estimate, rtol=0.05) diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py index 15c241d5d7a29d0e317cb6e5f46a40516e8a834f..74765f19e584c5de07c6aee4a36ec4e85438f862 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py @@ -168,7 +168,7 @@ class SoftmaxCentered(bijector.Bijector): # log_normalization = 1 + reduce_sum(exp(logits)) # -log_normalization + reduce_sum(logits - log_normalization) log_normalization = nn_ops.softplus( - math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True)) + math_ops.reduce_logsumexp(x, axis=-1, keepdims=True)) return array_ops.squeeze( (-log_normalization + math_ops.reduce_sum( x - log_normalization, axis=-1, keepdims=True)), axis=-1) diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb index 480777d948769b56ac1cc3be2052fe48459e98d6..66d52a74943d0d81fde05ce51b019558b327978d 100644 --- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -768,7 +768,7 @@ }, "outputs": [], "source": [ - "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { @@ -781,7 +781,7 @@ }, "outputs": [], "source": [ - "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { @@ -794,7 +794,7 @@ }, "outputs": [], "source": [ - "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { @@ -808,7 +808,7 @@ "outputs": [], "source": [ "# wrong translation\n", - "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" + "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)" ] }, { diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py index c88c0f52eead58c7562cda1a49d164c1d857822d..566246de4957c1dc5919c10e22146706f9e50be8 100644 --- a/tensorflow/contrib/eager/python/metrics_impl.py +++ b/tensorflow/contrib/eager/python/metrics_impl.py @@ -24,6 +24,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import function from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import smart_cond from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops @@ -354,9 +355,10 @@ class Mean(Metric): def write_summary_f(): summary_ops.scalar(name=self.name, tensor=t) return t - control_flow_ops.cond(write_summary, + smart_cond.smart_cond(write_summary, write_summary_f, - lambda: t) + lambda: t, + name="") return t diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py index 9d2d172752c7f3f3ee6eaa11ab8952313a4a3543..39e5957f5d1760613f2c33607c0bdb163040efb4 100644 --- a/tensorflow/contrib/eager/python/metrics_test.py +++ b/tensorflow/contrib/eager/python/metrics_test.py @@ -49,18 +49,6 @@ class MetricsTest(test.TestCase): self.assertEqual(dtypes.float64, m.dtype) self.assertEqual(dtypes.float64, m.result().dtype) - def testSummaryArg(self): - m = metrics.Mean() - m([1, 10, 100]) - m(1000) - m([10000.0, 100000.0]) - self.assertEqual(111111.0/6, m.result(write_summary=True).numpy()) - self.assertEqual(111111.0/6, m.result(write_summary=False).numpy()) - with self.assertRaises(ValueError): - m.result(write_summary=5) - with self.assertRaises(ValueError): - m.result(write_summary=[True]) - def testVariableCollections(self): with context.graph_mode(), ops.Graph().as_default(): m = metrics.Mean() diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py index 4454abfb9667f824b9de0100bb81bae24ad5f7a6..8c35dddb5a515aa09cc70c173a9f0605e8567e82 100644 --- a/tensorflow/contrib/eager/python/tfe_test.py +++ b/tensorflow/contrib/eager/python/tfe_test.py @@ -87,8 +87,8 @@ class TFETest(test_util.TensorFlowTestCase): x += 1. # Without a device context, heuristics are used to place ops. # In this case, ops.reduce_mean runs on the GPU. - reduction_indices = range(x.shape.ndims) - m = math_ops.reduce_mean(x, reduction_indices) + axis = range(x.shape.ndims) + m = math_ops.reduce_mean(x, axis) # m is on GPU, bring it back to CPU and compare. self.assertEqual(3.5, m.cpu().numpy()) diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD index bbe335be3e1384e8a86872165a4e37230e28b6c9..1cd83bdb5de7c2f6dc91c980750b49aca1a7790b 100644 --- a/tensorflow/contrib/feature_column/BUILD +++ b/tensorflow/contrib/feature_column/BUILD @@ -14,6 +14,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":sequence_feature_column", + ":sequence_feature_column_v2", "//tensorflow/python:util", ], ) @@ -32,7 +33,7 @@ py_library( "//tensorflow/python:sparse_ops", "//tensorflow/python:tensor_shape", "//tensorflow/python:variable_scope", - "//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", ], ) @@ -51,7 +52,7 @@ py_test( "//tensorflow/python:parsing_ops", "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", - "//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], @@ -69,7 +70,7 @@ py_test( "//tensorflow/python:parsing_ops", "//tensorflow/python:training", "//tensorflow/python:util", - "//tensorflow/python/feature_column", + "//tensorflow/python/feature_column:feature_column_py", "//tensorflow/python/keras:layers", ], ) @@ -89,7 +90,7 @@ py_library( "//tensorflow/python:tensor_shape", "//tensorflow/python:variable_scope", "//tensorflow/python/feature_column", - "//tensorflow/python/feature_column:feature_column_v2", + "//tensorflow/python/feature_column:feature_column_py", ], ) @@ -110,7 +111,7 @@ py_test( "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", "//tensorflow/python/feature_column", - "//tensorflow/python/feature_column:feature_column_v2", + "//tensorflow/python/feature_column:feature_column_py", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index dd6da35ed009c07ad3819e7860a283c7837c1f83..9b3a5c58aaa9498257fc971ac60b97f31d5185d8 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -222,10 +222,8 @@ def sequence_categorical_column_with_identity( ValueError: if `default_value` is not in range `[0, num_buckets)`. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_identity( - key=key, - num_buckets=num_buckets, - default_value=default_value)) + fc._categorical_column_with_identity( + key=key, num_buckets=num_buckets, default_value=default_value)) def sequence_categorical_column_with_hash_bucket( @@ -265,10 +263,8 @@ def sequence_categorical_column_with_hash_bucket( ValueError: `dtype` is neither string nor integer. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_hash_bucket( - key=key, - hash_bucket_size=hash_bucket_size, - dtype=dtype)) + fc._categorical_column_with_hash_bucket( + key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)) def sequence_categorical_column_with_vocabulary_file( @@ -324,7 +320,7 @@ def sequence_categorical_column_with_vocabulary_file( ValueError: `dtype` is neither string nor integer. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_vocabulary_file( + fc._categorical_column_with_vocabulary_file( key=key, vocabulary_file=vocabulary_file, vocabulary_size=vocabulary_size, @@ -384,7 +380,7 @@ def sequence_categorical_column_with_vocabulary_list( ValueError: if `dtype` is not integer or string. """ return fc._SequenceCategoricalColumn( - fc.categorical_column_with_vocabulary_list( + fc._categorical_column_with_vocabulary_list( key=key, vocabulary_list=vocabulary_list, dtype=dtype, diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py index d8ca363627eace15e039679545366648df174c33..bcc25b8de895a769f9e11b207c2092e23d029b1f 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py @@ -53,19 +53,20 @@ class SequenceFeatureColumnIntegrationTest(test.TestCase): return example def _build_feature_columns(self): - col = fc.categorical_column_with_identity( - 'int_ctx', num_buckets=100) + col = fc._categorical_column_with_identity('int_ctx', num_buckets=100) ctx_cols = [ - fc.embedding_column(col, dimension=10), - fc.numeric_column('float_ctx')] + fc._embedding_column(col, dimension=10), + fc._numeric_column('float_ctx') + ] identity_col = sfc.sequence_categorical_column_with_identity( 'int_list', num_buckets=10) bucket_col = sfc.sequence_categorical_column_with_hash_bucket( 'bytes_list', hash_bucket_size=100) seq_cols = [ - fc.embedding_column(identity_col, dimension=10), - fc.embedding_column(bucket_col, dimension=20)] + fc._embedding_column(identity_col, dimension=10), + fc._embedding_column(bucket_col, dimension=20) + ] return ctx_cols, seq_cols @@ -148,8 +149,8 @@ class SequenceExampleParsingTest(test.TestCase): """ example = _make_sequence_example() columns = [ - fc.categorical_column_with_identity('int_ctx', num_buckets=100), - fc.numeric_column('float_ctx'), + fc._categorical_column_with_identity('int_ctx', num_buckets=100), + fc._numeric_column('float_ctx'), col_fn(col_name, col_arg) ] context, seq_features = parsing_ops.parse_single_sequence_example( diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index 2163af0b43864c96483df529f07881f2f985a80e..d5f74028298ee7015f5b2e3aaee7d9330c1acac1 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc from tensorflow.python.feature_column import feature_column as fc +from tensorflow.python.feature_column import feature_column_lib as fc_lib from tensorflow.python.feature_column.feature_column import _LazyBuilder from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -109,13 +110,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=embedding_dimension_a, + embedding_column_a = fc._embedding_column( + categorical_column_a, + dimension=embedding_dimension_a, initializer=_get_initializer(embedding_dimension_a, embedding_values_a)) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_b = fc.embedding_column( - categorical_column_b, dimension=embedding_dimension_b, + embedding_column_b = fc._embedding_column( + categorical_column_b, + dimension=embedding_dimension_b, initializer=_get_initializer(embedding_dimension_b, embedding_values_b)) input_layer, sequence_length = sfc.sequence_input_layer( @@ -148,10 +151,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc.categorical_column_with_identity( + categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) + embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2) with self.assertRaisesRegexp( ValueError, @@ -206,7 +208,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) # Test that columns are reordered alphabetically. - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_b, categorical_column_a], dimension=embedding_dimension, initializer=_get_initializer(embedding_dimension, embedding_values)) @@ -244,11 +246,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc.categorical_column_with_identity( + categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( + categorical_column_b = fc._categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) with self.assertRaisesRegexp( @@ -315,10 +317,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size_a) - indicator_column_a = fc.indicator_column(categorical_column_a) + indicator_column_a = fc._indicator_column(categorical_column_a) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size_b) - indicator_column_b = fc.indicator_column(categorical_column_b) + indicator_column_b = fc._indicator_column(categorical_column_b) input_layer, sequence_length = sfc.sequence_input_layer( features={ 'aaa': sparse_input_a, @@ -342,9 +344,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc.categorical_column_with_identity( + categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) + indicator_column_a = fc._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -530,7 +532,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args) categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=3) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) input_layer, _ = sfc.sequence_input_layer( features={'aaa': sparse_input}, feature_columns=[indicator_column]) @@ -616,8 +618,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) + embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2) with self.assertRaisesRegexp( ValueError, @@ -639,7 +640,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) + indicator_column_a = fc._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -918,8 +919,9 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, dimension=embedding_dimension, + embedding_column = fc._embedding_column( + categorical_column, + dimension=embedding_dimension, initializer=_initializer) embedding_lookup, _ = embedding_column._get_sequence_dense_tensor( @@ -956,8 +958,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, dimension=2) + embedding_column = fc._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -984,8 +985,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, dimension=2) + embedding_column = fc._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) @@ -1055,7 +1055,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): key='aaa', num_buckets=vocabulary_size) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=embedding_dimension, initializer=_initializer) @@ -1101,7 +1101,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): expected_sequence_length_b = [2, 1] categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1152,7 +1152,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc.shared_embedding_columns( + shared_embedding_columns = fc_lib.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1218,7 +1218,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) indicator_tensor, _ = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1250,7 +1250,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1277,7 +1277,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc.indicator_column(categorical_column) + indicator_column = fc._indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py index 67ffb939663358b5e356b3b626978db959c1bac9..0d34ad161855476b6a4cd9a258521dbe122b4140 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py @@ -26,7 +26,7 @@ import collections from tensorflow.python.feature_column import feature_column as fc_old -from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape @@ -226,10 +226,8 @@ def sequence_categorical_column_with_identity( ValueError: if `default_value` is not in range `[0, num_buckets)`. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_identity( - key=key, - num_buckets=num_buckets, - default_value=default_value)) + fc_old._categorical_column_with_identity( + key=key, num_buckets=num_buckets, default_value=default_value)) def sequence_categorical_column_with_hash_bucket( @@ -269,10 +267,8 @@ def sequence_categorical_column_with_hash_bucket( ValueError: `dtype` is neither string nor integer. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_hash_bucket( - key=key, - hash_bucket_size=hash_bucket_size, - dtype=dtype)) + fc_old._categorical_column_with_hash_bucket( + key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)) def sequence_categorical_column_with_vocabulary_file( @@ -328,7 +324,7 @@ def sequence_categorical_column_with_vocabulary_file( ValueError: `dtype` is neither string nor integer. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_vocabulary_file( + fc_old._categorical_column_with_vocabulary_file( key=key, vocabulary_file=vocabulary_file, vocabulary_size=vocabulary_size, @@ -388,7 +384,7 @@ def sequence_categorical_column_with_vocabulary_list( ValueError: if `dtype` is not integer or string. """ return fc_old._SequenceCategoricalColumn( - fc_old.categorical_column_with_vocabulary_list( + fc_old._categorical_column_with_vocabulary_list( key=key, vocabulary_list=vocabulary_list, dtype=dtype, @@ -441,7 +437,7 @@ def sequence_numeric_column( ValueError: if any dimension in shape is not a positive integer. ValueError: if `dtype` is not convertible to `tf.float32`. """ - shape = fc._check_shape(shape=shape, key=key) + shape = fc_old._check_shape(shape=shape, key=key) if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py index 904d149fdbd532cf9a307f8fc06cfa221b7abf41..ca4398a142065de0be7bee57cd7e54670bbae12e 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py @@ -25,7 +25,7 @@ import numpy as np from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc from tensorflow.python.feature_column import feature_column as fc_old -from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.feature_column.feature_column import _LazyBuilder from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -111,13 +111,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc_old.embedding_column( - categorical_column_a, dimension=embedding_dimension_a, + embedding_column_a = fc_old._embedding_column( + categorical_column_a, + dimension=embedding_dimension_a, initializer=_get_initializer(embedding_dimension_a, embedding_values_a)) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_b = fc_old.embedding_column( - categorical_column_b, dimension=embedding_dimension_b, + embedding_column_b = fc_old._embedding_column( + categorical_column_b, + dimension=embedding_dimension_b, initializer=_get_initializer(embedding_dimension_b, embedding_values_b)) input_layer, sequence_length = sfc.sequence_input_layer( @@ -150,9 +152,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc_old.categorical_column_with_identity( + categorical_column_a = fc_old._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc_old.embedding_column( + embedding_column_a = fc_old._embedding_column( categorical_column_a, dimension=2) with self.assertRaisesRegexp( @@ -208,7 +210,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) # Test that columns are reordered alphabetically. - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_b, categorical_column_a], dimension=embedding_dimension, initializer=_get_initializer(embedding_dimension, embedding_values)) @@ -246,11 +248,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc_old.categorical_column_with_identity( + categorical_column_a = fc_old._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc_old.categorical_column_with_identity( + categorical_column_b = fc_old._categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) with self.assertRaisesRegexp( @@ -317,10 +319,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size_a) - indicator_column_a = fc_old.indicator_column(categorical_column_a) + indicator_column_a = fc_old._indicator_column(categorical_column_a) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size_b) - indicator_column_b = fc_old.indicator_column(categorical_column_b) + indicator_column_b = fc_old._indicator_column(categorical_column_b) input_layer, sequence_length = sfc.sequence_input_layer( features={ 'aaa': sparse_input_a, @@ -344,9 +346,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): values=(2, 0, 1), dense_shape=(2, 2)) - categorical_column_a = fc_old.categorical_column_with_identity( + categorical_column_a = fc_old._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc_old.indicator_column(categorical_column_a) + indicator_column_a = fc_old._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -532,7 +534,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase): sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args) categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=3) - indicator_column = fc_old.indicator_column(categorical_column) + indicator_column = fc_old._indicator_column(categorical_column) input_layer, _ = sfc.sequence_input_layer( features={'aaa': sparse_input}, feature_columns=[indicator_column]) @@ -618,7 +620,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc_old.embedding_column( + embedding_column_a = fc_old._embedding_column( categorical_column_a, dimension=2) with self.assertRaisesRegexp( @@ -641,7 +643,7 @@ class InputLayerTest(test.TestCase): categorical_column_a = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc_old.indicator_column(categorical_column_a) + indicator_column_a = fc_old._indicator_column(categorical_column_a) with self.assertRaisesRegexp( ValueError, @@ -920,8 +922,9 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc_old.embedding_column( - categorical_column, dimension=embedding_dimension, + embedding_column = fc_old._embedding_column( + categorical_column, + dimension=embedding_dimension, initializer=_initializer) embedding_lookup, _ = embedding_column._get_sequence_dense_tensor( @@ -958,8 +961,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc_old.embedding_column( - categorical_column, dimension=2) + embedding_column = fc_old._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -986,8 +988,7 @@ class SequenceEmbeddingColumnTest( categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc_old.embedding_column( - categorical_column, dimension=2) + embedding_column = fc_old._embedding_column(categorical_column, dimension=2) _, sequence_length = embedding_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) @@ -1057,7 +1058,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): key='aaa', num_buckets=vocabulary_size) categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=embedding_dimension, initializer=_initializer) @@ -1103,7 +1104,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): expected_sequence_length_b = [2, 1] categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1154,7 +1155,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase): categorical_column_b = sfc.sequence_categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - shared_embedding_columns = fc_old.shared_embedding_columns( + shared_embedding_columns = fc.shared_embedding_columns( [categorical_column_a, categorical_column_b], dimension=2) sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor( @@ -1220,7 +1221,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc_old.indicator_column(categorical_column) + indicator_column = fc_old._indicator_column(categorical_column) indicator_tensor, _ = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1252,7 +1253,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc_old.indicator_column(categorical_column) + indicator_column = fc_old._indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': inputs})) @@ -1279,7 +1280,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase): categorical_column = sfc.sequence_categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - indicator_column = fc.indicator_column_v2(categorical_column) + indicator_column = fc.indicator_column(categorical_column) _, sequence_length = indicator_column._get_sequence_dense_tensor( _LazyBuilder({'aaa': sparse_input})) diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD index cd747df4d69d2c264f5a64b491da9570b1423770..53efae1e10f30a2c5a42c9997c92ad909d77f58e 100644 --- a/tensorflow/contrib/framework/BUILD +++ b/tensorflow/contrib/framework/BUILD @@ -66,6 +66,7 @@ tf_custom_op_py_library( "//tensorflow/python:resource_variable_ops", "//tensorflow/python:script_ops", "//tensorflow/python:smart_cond", + "//tensorflow/python:sort_ops", "//tensorflow/python:sparse_tensor", "//tensorflow/python:state_ops", "//tensorflow/python:state_ops_gen", @@ -311,17 +312,3 @@ py_test( "//third_party/py/numpy", ], ) - -py_test( - name = "sort_ops_test", - size = "medium", - srcs = ["python/ops/sort_ops_test.py"], - srcs_version = "PY2AND3", - deps = [ - ":framework_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:random_ops", - "//third_party/py/numpy", - ], -) diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py index 1921a77c1e96ee3531d1ed0f98e41c27c9d427ac..42184a4e55e292f7921702e3f8909ae54f717702 100644 --- a/tensorflow/contrib/framework/python/ops/sort_ops.py +++ b/tensorflow/contrib/framework/python/ops/sort_ops.py @@ -22,173 +22,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np +from tensorflow.python.ops import sort_ops -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import ops as framework_ops -from tensorflow.python.framework import tensor_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn_ops - - -def sort(values, axis=-1, direction='ASCENDING', name=None): - """Sorts a tensor. - - Args: - values: 1-D or higher numeric `Tensor`. - axis: The axis along which to sort. The default is -1, which sorts the last - axis. - direction: The direction in which to sort the values (`'ASCENDING'` or - `'DESCENDING'`). - name: Optional name for the operation. - - Returns: - A `Tensor` with the same dtype and shape as `values`, with the elements - sorted along the given `axis`. - - Raises: - ValueError: If axis is not a constant scalar, or the direction is invalid. - """ - with framework_ops.name_scope(name, 'sort'): - return _sort_or_argsort(values, axis, direction, return_argsort=False) - - -def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None): - """Returns the indices of a tensor that give its sorted order along an axis. - - For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to - `tf.sort(values)`. For higher dimensions, the output has the same shape as - `values`, but along the given axis, values represent the index of the sorted - element in that slice of the tensor at the given position. - - Args: - values: 1-D or higher numeric `Tensor`. - axis: The axis along which to sort. The default is -1, which sorts the last - axis. - direction: The direction in which to sort the values (`'ASCENDING'` or - `'DESCENDING'`). - stable: If True, equal elements in the original tensor will not be - re-ordered in the returned order. Unstable sort is not yet implemented, - but will eventually be the default for performance reasons. If you - require a stable order, pass `stable=True` for forwards compatibility. - name: Optional name for the operation. - - Returns: - An int32 `Tensor` with the same shape as `values`. The indices that would - sort each slice of the given `values` along the given `axis`. - - Raises: - ValueError: If axis is not a constant scalar, or the direction is invalid. - """ - del stable # Unused. - with framework_ops.name_scope(name, 'argsort'): - return _sort_or_argsort(values, axis, direction, return_argsort=True) - - -def _sort_or_argsort(values, axis, direction, return_argsort): - """Internal sort/argsort implementation. - - Args: - values: The input values. - axis: The axis along which to sort. - direction: 'ASCENDING' or 'DESCENDING'. - return_argsort: Whether to return the argsort result. - - Returns: - Either the sorted values, or the indices of the sorted values in the - original tensor. See the `sort` and `argsort` docstrings. - - Raises: - ValueError: If axis is not a constant scalar, or the direction is invalid. - """ - if direction not in _SORT_IMPL: - raise ValueError('%s should be one of %s' % - (direction, ', '.join(sorted(_SORT_IMPL.keys())))) - # Axis must be an integer, not a Tensor. - axis = framework_ops.convert_to_tensor(axis, name='axis') - axis_static = tensor_util.constant_value(axis) - if axis.shape.ndims != 0 or axis_static is None: - raise ValueError('axis must be a constant scalar') - axis_static = int(axis_static) # Avoids NumPy casting error - - values = framework_ops.convert_to_tensor(values, name='values') - - return _SORT_IMPL[direction](values, axis_static, return_argsort) - - -def _descending_sort(values, axis, return_argsort=False): - """Sorts values in reverse using `top_k`. - - Args: - values: Tensor of numeric values. - axis: Index of the axis which values should be sorted along. - return_argsort: If False, return the sorted values. If True, return the - indices that would sort the values. - - Returns: - The sorted values. - """ - k = array_ops.shape(values)[axis] - rank = array_ops.rank(values) - static_rank = values.shape.ndims - # Fast path: sorting the last axis. - if axis == -1 or axis + 1 == values.get_shape().ndims: - top_k_input = values - transposition = None - else: - # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`. - if axis < 0: - # Calculate the actual axis index if counting from the end. Use the static - # rank if available, or else make the axis back into a tensor. - axis += static_rank or rank - if static_rank is not None: - # Prefer to calculate the transposition array in NumPy and make it a - # constant. - transposition = constant_op.constant( - np.r_[ - # Axes up to axis are unchanged. - np.arange(axis), - # Swap axis and rank - 1. - [static_rank - 1], - # Axes in [axis + 1, rank - 1) are unchanged. - np.arange(axis + 1, static_rank - 1), - # Swap axis and rank - 1. - [axis]], - name='transposition') - else: - # Generate the transposition array from the tensors. - transposition = array_ops.concat( - [ - # Axes up to axis are unchanged. - math_ops.range(axis), - # Swap axis and rank - 1. - [rank - 1], - # Axes in [axis + 1, rank - 1) are unchanged. - math_ops.range(axis + 1, rank - 1), - # Swap axis and rank - 1. - [axis] - ], - axis=0) - top_k_input = array_ops.transpose(values, transposition) - - values, indices = nn_ops.top_k(top_k_input, k) - return_value = indices if return_argsort else values - if transposition is not None: - # transposition contains a single cycle of length 2 (swapping 2 elements), - # so it is an involution (it is its own inverse). - return_value = array_ops.transpose(return_value, transposition) - return return_value - - -def _ascending_sort(values, axis, return_argsort=False): - # Negate the values to get the ascending order from descending sort. - values_or_indices = _descending_sort(-values, axis, return_argsort) - # If not argsort, negate the values again. - return values_or_indices if return_argsort else -values_or_indices - - -_SORT_IMPL = { - 'ASCENDING': _ascending_sort, - 'DESCENDING': _descending_sort, -} +sort = sort_ops.sort +argsort = sort_ops.argsort diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py index df0342c80c587cd0dfbf5f1455e05c31745995f5..c91ce2c0f3b371c867398007f5635a60a478ef45 100644 --- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py +++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py @@ -36,8 +36,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np - from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -817,7 +815,7 @@ def _numerically_stable_global_norm(tensor_list): Returns: A scalar tensor with the global norm. """ - if np.all([x is None for x in tensor_list]): + if all(x is None for x in tensor_list): return 0.0 list_max = math_ops.reduce_max([math_ops.reduce_max(math_ops.abs(x)) for x in diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py index 73185c79fc061a04d0468fe9898392d8e6951a74..cf5b9d9476738e58f6f1286bf5652d55b49ed4d5 100644 --- a/tensorflow/contrib/gan/python/train.py +++ b/tensorflow/contrib/gan/python/train.py @@ -1071,8 +1071,19 @@ def get_sequential_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): return get_hooks +def _num_joint_steps(train_steps): + g_steps = train_steps.generator_train_steps + d_steps = train_steps.discriminator_train_steps + # Get the number of each type of step that should be run. + num_d_and_g_steps = min(g_steps, d_steps) + num_g_steps = g_steps - num_d_and_g_steps + num_d_steps = d_steps - num_d_and_g_steps + + return num_d_and_g_steps, num_g_steps, num_d_steps + + def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): - """Returns a hooks function for sequential GAN training. + """Returns a hooks function for joint GAN training. When using these train hooks, IT IS RECOMMENDED TO USE `use_locking=True` ON ALL OPTIMIZERS TO AVOID RACE CONDITIONS. @@ -1105,12 +1116,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)): Returns: A function that takes a GANTrainOps tuple and returns a list of hooks. """ - g_steps = train_steps.generator_train_steps - d_steps = train_steps.discriminator_train_steps - # Get the number of each type of step that should be run. - num_d_and_g_steps = min(g_steps, d_steps) - num_g_steps = g_steps - num_d_and_g_steps - num_d_steps = d_steps - num_d_and_g_steps + num_d_and_g_steps, num_g_steps, num_d_steps = _num_joint_steps(train_steps) def get_hooks(train_ops): g_op = train_ops.generator_train_op diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py index 64d670619905a427a84bee4b661228abca591fae..e8c24eea3df3535665cb96eeddbc622e5fcaf706 100644 --- a/tensorflow/contrib/gan/python/train_test.py +++ b/tensorflow/contrib/gan/python/train_test.py @@ -519,7 +519,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase): """Test output type.""" loss = train.gan_loss(get_gan_model_fn(), add_summaries=True) self.assertIsInstance(loss, namedtuples.GANLoss) - self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0) + self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES)) @parameterized.named_parameters( ('cyclegan', create_cyclegan_model), @@ -528,7 +528,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase): def test_cyclegan_output_type(self, get_gan_model_fn): loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True) self.assertIsInstance(loss, namedtuples.CycleGANLoss) - self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0) + self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES)) @parameterized.named_parameters( ('gan', create_gan_model, False), @@ -759,7 +759,7 @@ class TensorPoolAdjusteModelTest(test.TestCase): # For [pool_size, ?), the pool is full, tensor2 must be equal to some # historical values of tensor1 (which is previously stored in the # pool). - self.assertTrue(any([(v == t2).all() for v in history_values])) + self.assertTrue(any((v == t2).all() for v in history_values)) def _make_new_model_and_check(self, model, pool_size): pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size) @@ -923,8 +923,7 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase): model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt) self.assertIsInstance(train_ops, namedtuples.GANTrainOps) # No new trainable variables should have been added. - self.assertEqual(num_trainable_vars, - len(variables_lib.get_trainable_variables())) + self.assertLen(variables_lib.get_trainable_variables(), num_trainable_vars) g_sync_init_op = g_opt.get_init_tokens_op(num_tokens=1) d_sync_init_op = d_opt.get_init_tokens_op(num_tokens=1) diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc index 94f522c04e5a09ed2d9355fa675125c340407923..fbccbead03fc0d641db40ede661bf3677d44c45d 100644 --- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc +++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc @@ -170,6 +170,14 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous { // Record "call" in active_ so that it can be aborted cleanly. RegisterCall(call); + // RendezvousMgr already aborted, shouldn't send RPC call any more + if (!call->status().ok()) { + done(call->status(), Args(), Args(), Tensor(), false); + session()->worker_cache->ReleaseWorker(src_worker, rwi); + delete call; + return; + } + // Start "call". Ref(); call->Start([this, call, src_worker, rwi, done]() { diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc index 478b716d88321101c971789f36c0ff8ecd3f418e..108da04494685f06f9afc26a26a5dadcdd99b0ff 100644 --- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc +++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc @@ -115,7 +115,7 @@ class AdjustHsvInYiqOp : public AdjustHsvInYiqOpBase { *context->device()->tensorflow_cpu_worker_threads(); Shard(worker_threads.num_threads, worker_threads.workers, channel_count, kCostPerChannel, - [channel_count, &input_data, &output_data, &tranformation_matrix]( + [&input_data, &output_data, &tranformation_matrix]( int64 start_channel, int64 end_channel) { // Applying projection matrix to input RGB vectors. const float* p = input_data.data() + start_channel * kChannelSize; diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py index 3327a9f9a613bfb56e6a25af0fe1c0ca18609035..9e19884df852c0fd259a55aef56c62b4189cd1da 100644 --- a/tensorflow/contrib/keras/api/keras/layers/__init__.py +++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py @@ -20,7 +20,7 @@ from __future__ import print_function # Generic layers. # pylint: disable=g-bad-import-order -from tensorflow.python.keras.engine.base_layer import InputSpec +from tensorflow.python.keras.engine.input_spec import InputSpec from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_layer import Input from tensorflow.python.keras.engine.input_layer import InputLayer diff --git a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py index de7530231db4ea4f50996a67eb8c0d6936db9dd3..1626e55b9b3bc82bd96703bfab765ac6ad81f462 100644 --- a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py +++ b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py @@ -90,7 +90,7 @@ def _update_features_and_columns(features, feature_columns, mapped_column_name = column_name + "_MAPPED" # Construct new feature columns based on provided kernel_mappers. column_kernel_mappers = kernel_mappers_dict[feature_column] - new_dim = sum([mapper.output_dim for mapper in column_kernel_mappers]) + new_dim = sum(mapper.output_dim for mapper in column_kernel_mappers) mapped_columns.add( layers.feature_column.real_valued_column(mapped_column_name, new_dim)) diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py index f42112206d0db9d2e42bd4cff19f6a6533951d46..3671633c8d795034b13cb55fd6db87c453e9fa12 100644 --- a/tensorflow/contrib/layers/python/layers/encoders.py +++ b/tensorflow/contrib/layers/python/layers/encoders.py @@ -84,8 +84,7 @@ def bow_encoder(ids, if isinstance(ids, sparse_tensor.SparseTensor): raise TypeError('ids are expected to be dense Tensor, got: %s', ids) return math_ops.reduce_mean( - embedding_ops.embedding_lookup(embeddings, ids), - reduction_indices=1) + embedding_ops.embedding_lookup(embeddings, ids), axis=1) def embed_sequence(ids, diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py index 222404b19db2b93b695ee6d2cb16584e17033700..00d819ed0e9fe3a5644105a571beda100204631e 100644 --- a/tensorflow/contrib/layers/python/layers/feature_column.py +++ b/tensorflow/contrib/layers/python/layers/feature_column.py @@ -1015,8 +1015,7 @@ class _OneHotColumn( dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0) # Reduce to get a multi-hot per example. - return math_ops.reduce_sum( - one_hot_id_tensor, reduction_indices=[output_rank - 1]) + return math_ops.reduce_sum(one_hot_id_tensor, axis=[output_rank - 1]) @property def _variable_shape(self): diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index ac9561c7693fc4ad994a00889aa3f15b4b5a5ee4..403b522ce45ac6ad98a321378626b87aaa7738aa 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -35,6 +35,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base from tensorflow.python.layers import convolutional as convolutional_layers from tensorflow.python.layers import core as core_layers @@ -1958,7 +1959,7 @@ class GDN(base.Layer): self._reparam_offset = reparam_offset self.data_format = data_format self._channel_axis() # trigger ValueError early - self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5) + self.input_spec = input_spec.InputSpec(min_ndim=3, max_ndim=5) def _channel_axis(self): try: @@ -2015,7 +2016,7 @@ class GDN(base.Layer): raise ValueError('The channel dimension of the inputs to `GDN` ' 'must be defined.') self._input_rank = input_shape.ndims - self.input_spec = base.InputSpec( + self.input_spec = input_spec.InputSpec( ndim=input_shape.ndims, axes={ channel_axis: num_channels }) diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index 8ead6336a08db4dd52edf0d3372db5a50f860e2b..0a4d2c6d4cb5cad7da93cea89478bc0fca2ac4d6 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -3811,7 +3811,7 @@ class UnitNormTests(test.TestCase): image = random_ops.random_uniform((height, width, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), axis=dim)) shape = [height, width, 3] del shape[dim] @@ -3847,7 +3847,7 @@ class UnitNormTests(test.TestCase): image = array_ops.placeholder(dtypes.float32, (None, None, 3)) output = _layers.unit_norm(image, dim=dim, epsilon=1e-6) norms = math_ops.sqrt( - math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim)) + math_ops.reduce_sum(math_ops.square(output), axis=dim)) with self.cached_session(): actual = norms.eval({image: placeholder_value}) diff --git a/tensorflow/contrib/layers/python/layers/regularizers_test.py b/tensorflow/contrib/layers/python/layers/regularizers_test.py index 51faba30c74d64c54d3d2b11d2a11195cca6b759..5cb00b76847430be8ade9f4e4fc8f7372035485a 100644 --- a/tensorflow/contrib/layers/python/layers/regularizers_test.py +++ b/tensorflow/contrib/layers/python/layers/regularizers_test.py @@ -141,7 +141,7 @@ class RegularizerTest(test.TestCase): dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x) array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]] tensor_weights_list = [constant_op.constant(x) for x in array_weights_list] - expected = sum([2 * x for l in array_weights_list for x in l]) + expected = sum(2 * x for l in array_weights_list for x in l) with self.cached_session(): result = regularizers.apply_regularization(dummy_regularizer, tensor_weights_list) diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py index 18ca4214a1c407653294ecfac0116bf00cda46a1..10fbd60ba2df4c3f84169bf04f249d67dc14573f 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py @@ -150,10 +150,10 @@ def _dnn_model_fn(features, labels, mode, params, config=None): "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as input_layer_scope: - if all([ + if all( isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access for fc in feature_columns - ]): + ): net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=feature_columns, diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py index 7a3cc8bd984b1b621f50d9dbf2979dcd6fa8b11f..2ade6b7b6ce2678ec8df7c98ffaa5636ae9d4b1d 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py @@ -236,10 +236,10 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None): "input_from_feature_columns", values=tuple(six.itervalues(features)), partitioner=input_layer_partitioner) as dnn_input_scope: - if all([ + if all( isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access for fc in dnn_feature_columns - ]): + ): net = layers.input_from_feature_columns( columns_to_tensors=features, feature_columns=dnn_feature_columns, @@ -292,8 +292,8 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None): linear_parent_scope, values=tuple(six.itervalues(features)), partitioner=linear_partitioner) as scope: - if all([isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access - for fc in linear_feature_columns]): + if all(isinstance(fc, feature_column_lib._FeatureColumn) # pylint: disable=protected-access + for fc in linear_feature_columns): if joint_linear_weights: linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns( columns_to_tensors=features, diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py index 1d8a59281a4934ad063362cba064e6cb3abff5a2..28c4964527bb034c8c6b1642366c6c82c1a72201 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py @@ -668,7 +668,7 @@ class DynamicRNNEstimatorLearningTest(test.TestCase): sequences = centers + noise inputs = array_ops.expand_dims(sequences, 2) - labels = math_ops.reduce_mean(sequences, reduction_indices=[1]) + labels = math_ops.reduce_mean(sequences, axis=[1]) return {'inputs': inputs}, labels return input_fn @@ -722,8 +722,8 @@ class DynamicRNNEstimatorLearningTest(test.TestCase): inputs = array_ops.expand_dims(math_ops.to_float(random_sequence), 2) labels = math_ops.to_int32( array_ops.squeeze( - math_ops.reduce_sum( - inputs, reduction_indices=[1]) > (sequence_length / 2.0))) + math_ops.reduce_sum(inputs, axis=[1]) > ( + sequence_length / 2.0))) return {'inputs': inputs}, labels return input_fn diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 8bc869db895b753be805219892342b5e6ea3799b..9132b2209bce8005b323d058d6d176784a84b2d1 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -1066,11 +1066,11 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable, chief_hooks = [] if (self._config.save_checkpoints_secs or self._config.save_checkpoints_steps): - saver_hook_exists = any([ + saver_hook_exists = any( isinstance(h, basic_session_run_hooks.CheckpointSaverHook) for h in (all_hooks + model_fn_ops.training_hooks + chief_hooks + model_fn_ops.training_chief_hooks) - ]) + ) if not saver_hook_exists: chief_hooks = [ basic_session_run_hooks.CheckpointSaverHook( @@ -1493,7 +1493,7 @@ class Estimator(BaseEstimator): # pylint: disable=protected-access class SKCompat(sklearn.BaseEstimator): """Scikit learn wrapper for TensorFlow Learn Estimator. - + THIS CLASS IS DEPRECATED. See [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md) for general migration instructions. diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index 439b17e505d1146492a32cc2fd58febee2b2456d..9ee8d8004bf26224dd96a98bad109720c44d04f7 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -155,8 +155,8 @@ def _linear_model_fn(features, labels, mode, params, config=None): parent_scope, values=tuple(six.itervalues(features)), partitioner=partitioner) as scope: - if all([isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access - for fc in feature_columns]): + if all(isinstance(fc, feature_column._FeatureColumn) # pylint: disable=protected-access + for fc in feature_columns): if joint_weights: layer_fn = layers.joint_weighted_sum_from_feature_columns else: diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py index 64ee2eef0ae2e93b6dd7bf6deef98d26c1808533..dfc76bfde6c0109f98093232b6f223d6938007f9 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py @@ -1745,7 +1745,7 @@ class LinearRegressorTest(test.TestCase): 'place_holder': constant_op.constant([[0.0]] * num_examples), }, constant_op.constant( - [[1 if i % 4 is 0 else 0] for i in range(num_examples)]) + [[1 if i % 4 == 0 else 0] for i in range(num_examples)]) place_holder = feature_column_lib.real_valued_column('place_holder') sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py index 647667188238dc18b137eaad98356a79b3a549b4..7a5354222f103aa0f45adc513079e420bbbfd30c 100644 --- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py +++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py @@ -524,7 +524,7 @@ class SDCALinearRegressorTest(test.TestCase): # LinearClassifier requires at least one column. 'place_holder': constant_op.constant([[0.0]] * num_examples), - }, constant_op.constant([[1 if i % 4 is 0 else 0] + }, constant_op.constant([[1 if i % 4 == 0 else 0] for i in range(num_examples)]) with self._single_threaded_test_session(): diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py index 619294b51822bd9983eda777acae5cf0d253926d..709a042bbcefb89125f7e4cd14a0d7ecd2b53281 100644 --- a/tensorflow/contrib/losses/python/losses/loss_ops.py +++ b/tensorflow/contrib/losses/python/losses/loss_ops.py @@ -22,7 +22,6 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.framework.python.ops import add_arg_scope -from tensorflow.python.compat import compat from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -60,41 +59,12 @@ def _scale_losses(losses, weights): """ # First, compute the sum of the losses over all elements: start_index = max(0, weights.get_shape().ndims) - reduction_indices = list(range(start_index, losses.get_shape().ndims)) - reduced_losses = math_ops.reduce_sum( - losses, reduction_indices=reduction_indices) + axis = list(range(start_index, losses.get_shape().ndims)) + reduced_losses = math_ops.reduce_sum(losses, axis=axis) reduced_losses = math_ops.multiply(reduced_losses, weights) return math_ops.reduce_sum(reduced_losses) -def _safe_div(numerator, denominator, name="value"): - """Computes a safe divide which returns 0 if the denominator is zero. - - Note that the function contains an additional conditional check that is - necessary for avoiding situations where the loss is zero causing NaNs to - creep into the gradient computation. - - Args: - numerator: An arbitrary `Tensor`. - denominator: A `Tensor` whose shape matches `numerator` and whose values are - assumed to be non-negative. - name: An optional name for the returned op. - - Returns: - The element-wise value of the numerator divided by the denominator. - """ - if compat.forward_compatible(2018, 11, 1): - return math_ops.div_no_nan(numerator, denominator, name=name) - return array_ops.where( - math_ops.greater(denominator, 0), - math_ops.div(numerator, - array_ops.where( - math_ops.equal(denominator, 0), - array_ops.ones_like(denominator), denominator)), - array_ops.zeros_like(numerator), - name=name) - - def _safe_mean(losses, num_present): """Computes a safe mean of the losses. @@ -107,7 +77,7 @@ def _safe_mean(losses, num_present): then zero is returned. """ total_loss = math_ops.reduce_sum(losses) - return _safe_div(total_loss, num_present, name="value") + return math_ops.div_no_nan(total_loss, num_present, name="value") @deprecated("2016-12-30", "Use tf.losses.compute_weighted_loss instead.") @@ -187,10 +157,9 @@ def _num_present(losses, weights, per_batch=False): # First, count the number of nonzero weights: if weights.get_shape().ndims >= 1: - reduction_indices = list(range(1, weights.get_shape().ndims)) + axis = list(range(1, weights.get_shape().ndims)) num_nonzero_per_batch = math_ops.reduce_sum( - math_ops.to_float(math_ops.not_equal(weights, 0)), - reduction_indices=reduction_indices) + math_ops.to_float(math_ops.not_equal(weights, 0)), axis=axis) # Next, determine the number of elements that weights would broadcast to: broadcast_dims = array_ops.slice( @@ -606,20 +575,20 @@ def mean_pairwise_squared_error(predictions, if weights.get_shape().ndims is None: raise ValueError("weights.get_shape().ndims cannot be None") - reduction_indices = list(range(1, diffs.get_shape().ndims)) + axis = list(range(1, diffs.get_shape().ndims)) sum_squares_diff_per_batch = math_ops.reduce_sum( - math_ops.square(diffs), reduction_indices=reduction_indices) + math_ops.square(diffs), axis=axis) num_present_per_batch = _num_present(diffs, weights, per_batch=True) - term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, - num_present_per_batch, - name="value") + term1 = 2.0 * math_ops.div_no_nan( + sum_squares_diff_per_batch, num_present_per_batch, name="value") - sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices) - term2 = 2.0 * _safe_div(math_ops.square(sum_diff), - math_ops.square(num_present_per_batch), - name="value") + sum_diff = math_ops.reduce_sum(diffs, axis=axis) + term2 = 2.0 * math_ops.div_no_nan( + math_ops.square(sum_diff), + math_ops.square(num_present_per_batch), + name="value") loss = _scale_losses(term1 - term2, weights) @@ -674,7 +643,7 @@ def cosine_distance(predictions, radial_diffs = math_ops.multiply(predictions, labels) losses = 1 - math_ops.reduce_sum( - radial_diffs, reduction_indices=[ + radial_diffs, axis=[ axis, ]) return compute_weighted_loss(losses, weights, scope=scope) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 0a07588f07f0bb89dbf5dc5909f511f74470fb41..b396c527673902d61072dc9cf7d2766476be8369 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -34,7 +34,7 @@ NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\. # 1.10 branch does not work. `make distclean` fails and blocks the build # process. For now we're hardcoding to the version which is used by # TensorFlow 1.9. -PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz" +PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz" # TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once # the archive has been propagated in mirror.bazel.build. RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index e779eff68901af7042deb5c09b78a230e0d06d02..655c7eefcb978d40c8bc16a23685e03ed71bfb63 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -157,6 +157,7 @@ tensorflow/core/kernels/mirror_pad_op_cpu_impl_2.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_3.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_4.cc tensorflow/core/kernels/mirror_pad_op_cpu_impl_5.cc +tensorflow/core/kernels/multinomial_op.cc tensorflow/core/kernels/no_op.cc tensorflow/core/kernels/non_max_suppression_op.cc tensorflow/core/kernels/one_hot_op.cc @@ -252,6 +253,7 @@ tensorflow/core/kernels/split_op.cc tensorflow/core/kernels/split_v_op.cc tensorflow/core/kernels/stack.cc tensorflow/core/kernels/stack_ops.cc +tensorflow/core/kernels/stateless_random_ops.cc tensorflow/core/kernels/strided_slice_op.cc tensorflow/core/kernels/strided_slice_op_inst_0.cc tensorflow/core/kernels/strided_slice_op_inst_1.cc diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py index ac1236086503a7c6e541bdf098efcb92f84e577f..062deb74b165329d8e72efa73b9d81f4174f8831 100644 --- a/tensorflow/contrib/metrics/python/metrics/classification.py +++ b/tensorflow/contrib/metrics/python/metrics/classification.py @@ -175,7 +175,7 @@ def f1_score(labels, predictions, weights=None, num_thresholds=200, return best_f1 best_f1 = distribution_strategy_context.get_replica_context().merge_call( - f1_across_replicas, values) + f1_across_replicas, args=(values,)) update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'], fn=update_ops['fn'], name='update') diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index d6932f6e4b603b1a76250ab622f5fe8eaea81bc9..7b432f8bd20989c6d95310bcaca88d44ce3e0d1f 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -24,7 +24,6 @@ from __future__ import print_function import collections as collections_lib -from tensorflow.python.compat import compat from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -46,32 +45,6 @@ from tensorflow.python.util.deprecation import deprecated _EPSILON = 1e-7 -def _safe_div(numerator, denominator): - """Computes a safe divide which returns 0 if the denominator is zero. - - Note that the function contains an additional conditional check that is - necessary for avoiding situations where the loss is zero causing NaNs to - creep into the gradient computation. - - Args: - numerator: An arbitrary `Tensor`. - denominator: A `Tensor` whose shape matches `numerator` and whose values are - assumed to be non-negative. - - Returns: - The element-wise value of the numerator divided by the denominator. - """ - if compat.forward_compatible(2018, 11, 1): - return math_ops.div_no_nan(numerator, denominator) - return array_ops.where( - math_ops.greater(denominator, 0), - math_ops.div(numerator, - array_ops.where( - math_ops.equal(denominator, 0), - array_ops.ones_like(denominator), denominator)), - array_ops.zeros_like(numerator)) - - @deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the ' 'order of the labels and predictions arguments has been switched.') def streaming_true_positives(predictions, @@ -3247,24 +3220,20 @@ def streaming_covariance(predictions, # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount) # batch_mean_prediction is E[x_B] in the update equation - batch_mean_prediction = _safe_div( - math_ops.reduce_sum(weighted_predictions), - batch_count) - delta_mean_prediction = _safe_div( - (batch_mean_prediction - mean_prediction) * batch_count, - update_count) + batch_mean_prediction = math_ops.div_no_nan( + math_ops.reduce_sum(weighted_predictions), batch_count) + delta_mean_prediction = math_ops.div_no_nan( + (batch_mean_prediction - mean_prediction) * batch_count, update_count) update_mean_prediction = state_ops.assign_add(mean_prediction, delta_mean_prediction) # prev_mean_prediction is E[x_A] in the update equation prev_mean_prediction = update_mean_prediction - delta_mean_prediction # batch_mean_label is E[y_B] in the update equation - batch_mean_label = _safe_div( - math_ops.reduce_sum(weighted_labels), - batch_count) - delta_mean_label = _safe_div( - (batch_mean_label - mean_label) * batch_count, - update_count) + batch_mean_label = math_ops.div_no_nan( + math_ops.reduce_sum(weighted_labels), batch_count) + delta_mean_label = math_ops.div_no_nan( + (batch_mean_label - mean_label) * batch_count, update_count) update_mean_label = state_ops.assign_add(mean_label, delta_mean_label) # prev_mean_label is E[y_A] in the update equation prev_mean_label = update_mean_label - delta_mean_label @@ -3447,7 +3416,7 @@ def streaming_mean_cosine_distance(predictions, predictions.get_shape().assert_is_compatible_with(labels.get_shape()) radial_diffs = math_ops.multiply(predictions, labels) radial_diffs = math_ops.reduce_sum( - radial_diffs, reduction_indices=[ + radial_diffs, axis=[ dim, ], keepdims=True) mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None, @@ -3926,9 +3895,8 @@ def cohen_kappa(labels, po_sum = math_ops.reduce_sum(po) total = math_ops.reduce_sum(pe_row) pe_sum = math_ops.reduce_sum( - _safe_div( - math_ops.to_double(pe_row * pe_col), - math_ops.to_double(total))) + math_ops.div_no_nan( + math_ops.to_double(pe_row * pe_col), math_ops.to_double(total))) po_sum, pe_sum, total = (math_ops.to_double(po_sum), math_ops.to_double(pe_sum), math_ops.to_double(total)) diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py index f0ce6fe03966c2de2dfd8ebcca07bf46afcf4fce..1fa5c8cb485704a5fccc486e823bbc4050bf505a 100644 --- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py +++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops @@ -119,7 +120,7 @@ class _MaskedConv(base.Layer): self.bias_initializer = bias_initializer self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer - self.input_spec = base.InputSpec(ndim=self.rank + 2) + self.input_spec = input_spec.InputSpec(ndim=self.rank + 2) def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) @@ -171,7 +172,7 @@ class _MaskedConv(base.Layer): dtype=self.dtype) else: self.bias = None - self.input_spec = base.InputSpec( + self.input_spec = input_spec.InputSpec( ndim=self.rank + 2, axes={channel_axis: input_dim}) self.built = True @@ -393,14 +394,14 @@ class MaskedFullyConnected(base.Layer): self.bias_initializer = bias_initializer self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer - self.input_spec = base.InputSpec(min_ndim=2) + self.input_spec = input_spec.InputSpec(min_ndim=2) def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) if tensor_shape.dimension_value(input_shape[-1]) is None: raise ValueError('The last dimension of the inputs to `Dense` ' 'should be defined. Found `None`.') - self.input_spec = base.InputSpec( + self.input_spec = input_spec.InputSpec( min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])}) self.kernel = self.add_variable( diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py index 155ff5b3f4f29d4d9c81bb265d19d1b8cce4fef2..960826407b66b4efa3c2693efb6d2e17c4b47b33 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import state_ops @@ -83,14 +84,14 @@ class NadamOptimizer(adam.AdamOptimizer): with ops.control_dependencies([m_t]): m_t = scatter_add(m, indices, m_scaled_g_values) # m_bar = (1 - beta1) * g_t + beta1 * m_t - m_bar = m_scaled_g_values + beta1_t * m_t + m_bar = m_scaled_g_values + beta1_t * array_ops.gather(m_t, indices) # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) v = self.get_slot(var, "v") v_scaled_g_values = (grad * grad) * (1 - beta2_t) v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking) with ops.control_dependencies([v_t]): v_t = scatter_add(v, indices, v_scaled_g_values) - v_sqrt = math_ops.sqrt(v_t) - var_update = state_ops.assign_sub( - var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking) + v_t_slice = array_ops.gather(v_t, indices) + v_sqrt = math_ops.sqrt(v_t_slice) + var_update = scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t)) return control_flow_ops.group(*[var_update, m_bar, v_t]) diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py index 85e05ce71cec6ef897cadb7d123e630febb3c064..a4372f64874e7591dbceac901fad6c941209bef9 100644 --- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py +++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py @@ -52,14 +52,19 @@ def nadam_update_numpy(param, class NadamOptimizerTest(test.TestCase): def doTestSparse(self, use_resource=False): + # need to use a larger value of epsilon here so that + # np.sqrt(v_t) + epsilon doesn't get rounded to 0 when + # the dtype is half and np.sqrt(v_t) = 0, as is the case + # when the gradient is 0 + sparse_epsilon = 1e-7 for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: with self.cached_session(): # Initialize variables for numpy implementation. m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0 - var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype) - grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype) - var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype) - grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype) + var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype) + grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype) + var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype) + grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype) if use_resource: var0 = resource_variable_ops.ResourceVariable(var0_np) @@ -67,21 +72,21 @@ class NadamOptimizerTest(test.TestCase): else: var0 = variables.Variable(var0_np) var1 = variables.Variable(var1_np) - grads0_np_indices = np.array([0, 1], dtype=np.int32) + grads0_np_indices = np.array([0, 2], dtype=np.int32) grads0 = ops.IndexedSlices( - constant_op.constant(grads0_np), - constant_op.constant(grads0_np_indices), constant_op.constant([2])) - grads1_np_indices = np.array([0, 1], dtype=np.int32) + constant_op.constant(grads0_np[grads0_np_indices]), + constant_op.constant(grads0_np_indices), constant_op.constant([3])) + grads1_np_indices = np.array([0, 2], dtype=np.int32) grads1 = ops.IndexedSlices( - constant_op.constant(grads1_np), - constant_op.constant(grads1_np_indices), constant_op.constant([2])) - opt = nadam_optimizer.NadamOptimizer() + constant_op.constant(grads1_np[grads1_np_indices]), + constant_op.constant(grads1_np_indices), constant_op.constant([3])) + opt = nadam_optimizer.NadamOptimizer(epsilon=sparse_epsilon) update = opt.apply_gradients(zip([grads0, grads1], [var0, var1])) variables.global_variables_initializer().run() # Fetch params to validate initial values - self.assertAllClose([1.0, 2.0], var0.eval()) - self.assertAllClose([3.0, 4.0], var1.eval()) + self.assertAllClose([1.0, 1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 3.0, 4.0], var1.eval()) beta1_power, beta2_power = opt._get_beta_accumulators() @@ -91,8 +96,10 @@ class NadamOptimizerTest(test.TestCase): self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval()) update.run() - var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0) - var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1) + var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0, + epsilon=sparse_epsilon) + var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1, + epsilon=sparse_epsilon) # Validate updated params self.assertAllCloseAccordingToType(var0_np, var0.eval()) diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index 1b6e70d3a03d270dc9ec4cedc470f1c2e03aadeb..a72db5e12fc086c3ec817d25d4964bbb9df2db60 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -447,7 +447,7 @@ class _OptimizerV2State(object): if v is None: if colocate_with is None: colocate_with = self._non_slot_devices - with self._distribution.colocate_vars_with(colocate_with): + with self._distribution.extended.colocate_vars_with(colocate_with): # TODO(josh11b): Use get_variable() except for the legacy Adam use case. v = variable_scope.variable(initial_value, name=name, trainable=False) self._non_slot_dict[name] = v @@ -892,7 +892,8 @@ class OptimizerV2(optimizer_v1.Optimizer): raise ValueError("No gradients provided for any variable: %s." % ([str(v) for _, v in grads_and_vars],)) return distribute_ctx.get_replica_context().merge_call( - self._distributed_apply, filtered, global_step=global_step, name=name) + self._distributed_apply, args=(filtered,), + kwargs={"global_step": global_step, "name": name}) def _get_or_create_state(self, var_list=None): """Either looks up or creates `_OptimizerV2State`. @@ -927,7 +928,7 @@ class OptimizerV2(optimizer_v1.Optimizer): def _distributed_apply(self, distribution, grads_and_vars, global_step, name): """`apply_gradients` for use with a `DistributionStrategy`.""" - reduced_grads = distribution.batch_reduce( + reduced_grads = distribution.extended.batch_reduce_to( ds_reduce_util.ReduceOp.SUM, grads_and_vars) var_list = [v for _, v in grads_and_vars] grads_and_vars = zip(reduced_grads, var_list) @@ -944,7 +945,7 @@ class OptimizerV2(optimizer_v1.Optimizer): with ops.name_scope(name, self._name) as name: per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list) # Include the current value of any dynamic hyper parameters in `state`. - non_slot_devices = distribution.non_slot_devices(var_list) + non_slot_devices = distribution.extended.non_slot_devices(var_list) state = per_graph_state._copy_with_dynamic_hyper( # pylint: disable=protected-access self._hyper, distribution, non_slot_devices) @@ -989,7 +990,8 @@ class OptimizerV2(optimizer_v1.Optimizer): # Use the processors to update the variables. update_ops = [] for grad, var in grads_and_vars: - update_ops.extend(distribution.update(var, update, grad, grouped=False)) + update_ops.extend(distribution.extended.update( + var, update, args=(grad,), group=False)) # Give the child class a chance to do something after applying # gradients @@ -1001,8 +1003,8 @@ class OptimizerV2(optimizer_v1.Optimizer): update_ops = control_flow_ops.group(update_ops) with ops.control_dependencies([update_ops]): - finish_updates = distribution.update_non_slot( - non_slot_devices, finish, grouped=False) + finish_updates = distribution.extended.update_non_slot( + non_slot_devices, finish, group=False) # We said grouped=False, which means finish_updates is always a list. # It will be [None] when finish() returns None. if finish_updates == [None]: @@ -1017,8 +1019,8 @@ class OptimizerV2(optimizer_v1.Optimizer): def update_global_step(global_step, name): return global_step.assign_add(1, read_value=False, name=name) - apply_updates = distribution.update(global_step, update_global_step, - name) + apply_updates = distribution.extended.update( + global_step, update_global_step, args=(name,)) # Add the training op to the TRAIN_OP graph collection in graph mode. if not eager_execution: diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py index 6f659347fba019288361dd0420f2ade6dc1bebaf..8619708cdaecd78bcc7de0e8e0cbf2baa11bf6a2 100644 --- a/tensorflow/contrib/quantize/python/quant_ops.py +++ b/tensorflow/contrib/quantize/python/quant_ops.py @@ -138,7 +138,7 @@ def LastValueQuantize(inputs, if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min( - inputs, reduction_indices=reduce_dims, name='BatchMin') + inputs, axis=reduce_dims, name='BatchMin') else: batch_min = inputs else: @@ -147,7 +147,7 @@ def LastValueQuantize(inputs, if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max( - inputs, reduction_indices=reduce_dims, name='BatchMax') + inputs, axis=reduce_dims, name='BatchMax') else: batch_max = inputs else: @@ -263,7 +263,7 @@ def MovingAvgQuantize(inputs, if per_channel: if input_dim >= 2: batch_min = math_ops.reduce_min( - inputs, reduction_indices=reduce_dims, name='BatchMin') + inputs, axis=reduce_dims, name='BatchMin') else: batch_min = inputs else: @@ -272,7 +272,7 @@ def MovingAvgQuantize(inputs, if per_channel: if input_dim >= 2: batch_max = math_ops.reduce_max( - inputs, reduction_indices=reduce_dims, name='BatchMax') + inputs, axis=reduce_dims, name='BatchMax') else: batch_max = inputs else: diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index 338923f75125ed3d1a2b1046a65d563bc8f7d3e3..21d1b1213090273b5abd8e012f8711db98c94347 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -160,7 +160,7 @@ def Quantize(graph, # shouldn't quantize it, since the activation will be Fused into the # Add at inference time. consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op) - if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]): + if any(consumer.type in _ACTIVATION_TYPES for consumer in consumers): logging.info('Skipping %s, because its followed by an activation.', layer_match.bypass_op.name) else: @@ -195,7 +195,7 @@ def Quantize(graph, # Add at inference time. consumers = input_to_ops_map.ConsumerOperations( layer_match.post_activation_bypass_op) - if any([consumer.type in _RELU_TYPES for consumer in consumers]): + if any(consumer.type in _RELU_TYPES for consumer in consumers): logging.info('Skipping %s, because its followed by an activation.', layer_match.post_activation_bypass_op.name) else: diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py index b30ca7882fce1747cb1dcb27f97f5b012ff9da02..251a933eaec826b08266123245d9aef8573d3e06 100644 --- a/tensorflow/contrib/rnn/python/ops/gru_ops.py +++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py @@ -21,7 +21,7 @@ from tensorflow.contrib.rnn.ops import gen_gru_ops from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.layers import base as base_layer +from tensorflow.python.keras.engine import input_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -165,7 +165,7 @@ class GRUBlockCell(LayerRNNCell): num_units = cell_size self._cell_size = num_units # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) @property def state_size(self): diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py index 4db431f85a467389717e98d87875afce5e08b974..b043026bc556a8879b15b432829baf8136250c0e 100644 --- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py +++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py @@ -25,6 +25,7 @@ from tensorflow.contrib.rnn.ops import gen_lstm_ops from tensorflow.contrib.util import loader from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.keras.engine import input_spec from tensorflow.python.layers import base as base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops @@ -385,7 +386,7 @@ class LSTMBlockCell(LayerRNNCell): "scope": "lstm_cell" } # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) @property def state_size(self): @@ -628,7 +629,7 @@ class LSTMBlockFusedCell(LSTMBlockWrapper): self._use_peephole = use_peephole # Inputs must be 3-dimensional. - self.input_spec = base_layer.InputSpec(ndim=3) + self.input_spec = input_spec.InputSpec(ndim=3) @property def num_units(self): diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py index e159dc95796e8f02287a4b6db4d25023348fe8da..8a1c09f171e6108174671e3122d5ff4c0b236003 100644 --- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py +++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py @@ -30,7 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import activations from tensorflow.python.keras import initializers -from tensorflow.python.layers import base as base_layer +from tensorflow.python.keras.engine import input_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gen_array_ops @@ -2752,7 +2752,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell): self._activation = activation or math_ops.tanh # Restrict inputs to be 2-dimensional matrices - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) @property def state_size(self): @@ -3089,7 +3089,7 @@ class IndRNNCell(rnn_cell_impl.LayerRNNCell): super(IndRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self._num_units = num_units self._activation = activation or math_ops.tanh @@ -3183,7 +3183,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell): super(IndyGRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self._num_units = num_units self._activation = activation or math_ops.tanh @@ -3323,7 +3323,7 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell): super(IndyLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self._num_units = num_units self._forget_bias = forget_bias @@ -3444,7 +3444,7 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell): super(MinimalRNNCell, self).__init__(name=name, dtype=dtype, **kwargs) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self.units = units self.activation = activations.get(activation) @@ -3558,7 +3558,7 @@ class CFNCell(rnn_cell_impl.LayerRNNCell): super(CFNCell, self).__init__(name=name, dtype=dtype, **kwargs) # Inputs must be 2-dimensional. - self.input_spec = base_layer.InputSpec(ndim=2) + self.input_spec = input_spec.InputSpec(ndim=2) self.units = units self.activation = activations.get(activation) diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD index f0947fe423f7e6bf84dae468bc36ca11147ac0bb..269443b2c6508bb618d30f64487b1a6a84e8646f 100644 --- a/tensorflow/contrib/saved_model/BUILD +++ b/tensorflow/contrib/saved_model/BUILD @@ -102,7 +102,10 @@ py_test( size = "medium", srcs = ["python/saved_model/keras_saved_model_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], + tags = [ + "no_oss", # TODO(b/119349471): Re-enable + "no_windows", + ], deps = [ ":keras_saved_model", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py index 0619c4ca5a584f0af7778b81c4d2c2eb2fd9f60d..d8637effe2ba88689d591482b067ac6f4a1683c1 100644 --- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py +++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py @@ -348,12 +348,14 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): # feeding in the inputs and targets. loss, predictions, _ = sess.run( (outputs['loss'], outputs['predictions/' + output_name], - outputs['metrics/mae/update_op']), - {inputs[input_name]: input_arr, inputs[target_name]: target_arr}) + outputs['metrics/mean_absolute_error/update_op']), { + inputs[input_name]: input_arr, + inputs[target_name]: target_arr + }) # The metric value should be run after the update op, to ensure that it # reflects the correct value. - metric_value = sess.run(outputs['metrics/mae/value']) + metric_value = sess.run(outputs['metrics/mean_absolute_error/value']) self.assertEqual(int(train_before_export), sess.run(training_module.get_global_step())) @@ -368,8 +370,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): self.assertEqual(int(train_before_export), sess.run(training_module.get_global_step())) self.assertIn('loss', outputs) - self.assertIn('metrics/mae/update_op', outputs) - self.assertIn('metrics/mae/value', outputs) + self.assertIn('metrics/mean_absolute_error/update_op', outputs) + self.assertIn('metrics/mean_absolute_error/value', outputs) self.assertIn('predictions/' + output_name, outputs) # Train for a step diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py index 8668c67cf95aba6cbd466142bed37c79e34d9e04..922f21b98b35dfff19c8c605a25e89c5d2da8d98 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py @@ -154,8 +154,8 @@ class AttentionWrapperTest(test.TestCase): if attention_layer_sizes is not None: # Compute sum of attention_layer_sizes. Use encoder_output_depth if None. - attention_depth = sum([attention_layer_size or encoder_output_depth - for attention_layer_size in attention_layer_sizes]) + attention_depth = sum(attention_layer_size or encoder_output_depth + for attention_layer_size in attention_layer_sizes) elif attention_layers is not None: # Compute sum of attention_layers output depth. attention_depth = sum( diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py index 42898e797cc351e3de290cc65fc825f1406c739d..605625c3059868d349da015b8286d219691fc255 100644 --- a/tensorflow/contrib/summary/summary.py +++ b/tensorflow/contrib/summary/summary.py @@ -79,6 +79,7 @@ from tensorflow.python.ops.summary_ops_v2 import image from tensorflow.python.ops.summary_ops_v2 import import_event from tensorflow.python.ops.summary_ops_v2 import initialize from tensorflow.python.ops.summary_ops_v2 import never_record_summaries +from tensorflow.python.ops.summary_ops_v2 import record_summaries from tensorflow.python.ops.summary_ops_v2 import record_summaries_every_n_global_steps from tensorflow.python.ops.summary_ops_v2 import scalar from tensorflow.python.ops.summary_ops_v2 import should_record_summaries diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD index 20bcd2447e6fd7eaf11e3e5cf383f6abf168c787..784acce444a8d0c066f1b7ae6c1b5d7d65405549 100644 --- a/tensorflow/contrib/tensorrt/BUILD +++ b/tensorflow/contrib/tensorrt/BUILD @@ -29,6 +29,10 @@ load( "if_tensorrt", ) +exports_files(glob([ + "test/testdata/*", +])) + tf_cuda_cc_test( name = "tensorrt_test_cc", size = "small", @@ -491,6 +495,7 @@ cuda_py_tests( "test/memory_alignment_test.py", "test/multi_connection_neighbor_engine_test.py", "test/neighboring_engine_test.py", + "test/quantization_test.py", "test/rank_two_test.py", "test/reshape_transpose_test.py", "test/vgg_block_nchw_test.py", @@ -527,6 +532,30 @@ cuda_py_tests( ], ) +cuda_py_test( + name = "quantization_mnist_test", + srcs = ["test/quantization_mnist_test.py"], + additional_deps = [ + ":tf_trt_integration_test_base", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python/keras:keras", + "//tensorflow/python/estimator:estimator", + ], + data = [ + "test/testdata/checkpoint", + "test/testdata/model.ckpt-46900.data-00000-of-00001", + "test/testdata/model.ckpt-46900.index", + ], + tags = [ + "no_cuda_on_cpu_tap", + "no_pip", + "no_tap", # It is not able to download the mnist data. + "no_windows", + "nomac", + ], +) + cc_library( name = "utils", srcs = ["convert/utils.cc"], diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index f95ffe4100c700d836b0e5ff3b28f5e8d0fdf2d3..21f505b7fe72f35b57594456a9405da2e23d7083 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -82,60 +82,73 @@ std::vector GetLoadedTensorRTVersion() { } TrtCandidateSelector::TrtCandidateSelector( - const grappler::GraphProperties& graph_properties) - : graph_properties_(graph_properties) {} + const grappler::GraphProperties& graph_properties, int precision_mode) + : graph_properties_(graph_properties), precision_mode_(precision_mode) {} Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) { // TODO(laigd): move this set to TrtNodeValidator where it should belong. // LINT.IfChange static const std::set candidate_ops = { - "Identity", - "Snapshot", - "Const", - "Conv2D", - "MaxPool", - "BiasAdd", - "Relu", - "Add", - "Mul", - "Sub", - "Rsqrt", - "Pad", - "Mean", - "AvgPool", - "ConcatV2", - "DepthwiseConv2dNative", - "FusedBatchNorm", - "FusedBatchNormV2", - "Div", - "RealDiv", - "Rsqrt", - "Reciprocal", - "Exp", - "Log", - "Sqrt", - "Abs", - "Neg", - "Transpose", - "Reshape", - "MatMul", - "BatchMatMul", - "Softmax", - "Minimum", - "Maximum", - "TopKV2", - "Sum", - "Prod", - "Max", - "Min", + "Identity", + "Snapshot", + "Const", + "Conv2D", + "MaxPool", + "BiasAdd", + "Relu", + "Add", + "Mul", + "Sub", + "Rsqrt", + "Pad", + "Mean", + "AvgPool", + "ConcatV2", + "DepthwiseConv2dNative", + "FusedBatchNorm", + "FusedBatchNormV2", + "Div", + "RealDiv", + "Rsqrt", + "Reciprocal", + "Exp", + "Log", + "Sqrt", + "Abs", + "Neg", + "Transpose", + "Reshape", + "MatMul", + "BatchMatMul", + "Softmax", + "Minimum", + "Maximum", + "TopKV2", + "Sum", + "Prod", + "Max", + "Min", + "Relu6", }; - // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc) - const bool is_supported_op_type = + bool is_supported_op_type = (candidate_ops.count(node->type_string()) || PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string())); + static const std::set quantize_ops = { + "QuantizeAndDequantizeV2", + "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", + "FakeQuantWithMinMaxArgs", + }; + // In INT8 mode, we will always apply the quantization ranges provided by + // these ops to the relevant tensors. This happens regardless of the value of + // use_calibration. + if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) { + is_supported_op_type = true; + } + // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc) if (!is_supported_op_type) { return errors::Unimplemented("Op type ", node->type_string(), - " is not supported."); + " is not supported"); } std::vector input_edges; @@ -220,7 +233,8 @@ tensorflow::Status ConvertGraphDefToTensorRT( const std::vector& output_names, size_t max_batch_size, size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, int precision_mode, int minimum_segment_size, bool is_dyn_op, - int max_cached_engines, std::vector cached_engine_batches) { + int max_cached_engines, std::vector cached_engine_batches, + bool use_calibration) { // Create GrapplerItem. tensorflow::grappler::GrapplerItem item; item.fetch = output_names; @@ -287,6 +301,7 @@ tensorflow::Status ConvertGraphDefToTensorRT( list->add_i(batch); } } + parameters["use_calibration"].set_b(use_calibration); // Run optimizer. tensorflow::grappler::MetaOptimizer meta_opt(nullptr, config_proto); @@ -566,27 +581,30 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, } } } + + const bool calibrate_int8 = + (info.precision_mode == INT8MODE && info.use_calibration); + // Build the engine and get its serialized representation. string segment_string; - if (info.engine_type == EngineInfo::EngineType::TRTStatic || - info.precision_mode == INT8MODE) { + if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) { // Create static engine for fp32/fp16 mode, and test validity of the engine - // for int8 mode. We don't want engine to fail at the calibration time. - // So we are constructing a FP32 engine here to check its validity, and if - // it is a valid engine then we put the serialized graphdef to the op. - // Otherwise we skip node creation for this engine. + // for int8 calibration mode. We don't want engine to fail at the + // calibration time. So we are constructing a FP32 engine here to check its + // validity, and if it is a valid engine then we put the serialized graphdef + // to the op. Otherwise we skip node creation for this engine. Logger trt_logger; TrtUniquePtrType engine; // TODO(sami): What happens if 1st dim is not batch? TF_RETURN_IF_ERROR(ConvertGraphDefToEngine( - info.segment_graph_def, - info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode, + info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode, max_batch_size, info.max_workspace_size_bytes, input_shapes, &trt_logger, alloc, /*calibrator=*/nullptr, &engine, + info.use_calibration, /*convert_successfully=*/nullptr)); TrtUniquePtrType engine_data(engine->serialize()); segment_string = string((const char*)engine_data->data(), engine_data->size()); - if (info.precision_mode == INT8MODE) { + if (calibrate_int8) { // See above comment about why not putting this inside the 'else' branch. segment_string = info.segment_graph_def.SerializeAsString(); } @@ -598,7 +616,7 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, // conversion. string prec_string; TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string)); - if (info.precision_mode == INT8MODE && + if (info.precision_mode == INT8MODE && calibrate_int8 && !TRTResourceManager::instance()->getManager("TRTCalibration")) { LOG(ERROR) << "Failed to construct calibration storage"; } @@ -634,6 +652,7 @@ tensorflow::Status CreateTRTNode(const std::vector& infos, int pos, .Attr("cached_engine_batches", {max_batch_size}) .Attr("workspace_size_bytes", info.max_workspace_size_bytes) .Attr("precision_mode", prec_string) + .Attr("use_calibration", info.use_calibration) .Attr("OutT", out_types) .Finalize(&trt_node); if (!status.ok()) { @@ -866,7 +885,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { } segment_options.minimum_segment_size = params.minimum_segment_size; tensorflow::tensorrt::segment::SegmentNodesVector initial_segments; - TrtCandidateSelector candidate_selector(*params.graph_properties); + TrtCandidateSelector candidate_selector(*params.graph_properties, + params.precision_mode); TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph( &graph, std::bind(&TrtCandidateSelector::IsTensorRTCandidate, &candidate_selector, @@ -904,10 +924,14 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) { continue; } curr_engine.precision_mode = params.precision_mode; - curr_engine.engine_type = - (params.is_dyn_op || params.precision_mode == INT8MODE - ? EngineInfo::EngineType::TRTDynamic - : EngineInfo::EngineType::TRTStatic); + if (params.use_calibration && params.precision_mode != INT8MODE) { + return errors::InvalidArgument( + "Calibration with FP32 or FP16 is not supported."); + } + curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration) + ? EngineInfo::EngineType::TRTDynamic + : EngineInfo::EngineType::TRTStatic); + curr_engine.use_calibration = params.use_calibration; curr_engine.cached_engine_batches = params.cached_engine_batches; curr_engine.maximum_cached_engines = params.max_cached_engines; StrAppend(&curr_engine.engine_name, "my_trt_op_", t); diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h index 1c9d82105a7b380cafbb27c340a4cc9d1580ee2c..1f39f56f6392ba33af3d74fec12c326ed4451cb6 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.h +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h @@ -35,7 +35,8 @@ namespace convert { // supported by TRT. class TrtCandidateSelector { public: - TrtCandidateSelector(const grappler::GraphProperties& graph_properties); + TrtCandidateSelector(const grappler::GraphProperties& graph_properties, + int precision_mode); // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added // to TRT subgraph and later converted into TRT engine. @@ -49,6 +50,9 @@ class TrtCandidateSelector { // GraphProperties of the graph whose nodes are to be validated by // IsTensorRTCandidate(). const grappler::GraphProperties& graph_properties_; + + // Quantization ops are only converted when using quantized precisions. + const int precision_mode_; }; struct ConversionParams { @@ -63,6 +67,7 @@ struct ConversionParams { cluster(nullptr), is_dyn_op(false), fixed_input_size(true), + use_calibration(true), max_cached_engines(1) {} const tensorflow::GraphDef* input_graph_def; const std::vector* output_names; @@ -76,6 +81,7 @@ struct ConversionParams { bool is_dyn_op; // Whether to create engine on conversion or execution time bool fixed_input_size; // Assume non-batch ranks of input tensors are fixed int max_cached_engines; // maximum number of cached engines + bool use_calibration; std::vector cached_engine_batches; // list of cached engines }; @@ -95,7 +101,7 @@ tensorflow::Status ConvertGraphDefToTensorRT( size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def, int precision_mode = 1, int minimum_segment_size = 3, bool is_dyn_op = false, int max_cached_engines = 1, - std::vector cached_engine_batches = {}); + std::vector cached_engine_batches = {}, bool use_calibration = true); // Method to call from optimization pass tensorflow::Status ConvertAfterShapes(ConversionParams& params); diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc index f10729987fdb787c6a745fdac28fe7d7d60d08fa..2d2bfeb192c1893824c7b30bfad593c62c203392 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc @@ -85,27 +85,42 @@ TEST(TrtCandidateSelector, Basics) { ops::MatMul(s.WithOpName("matmul_with_incompatible_input"), incompatible_feed, const_2); + // Quantize ops. + auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); + auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("quantize"), feed, + quantize_attrs); + + // Get GrapplerItem and GraphProperties. grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); Tensor feed_tensor(DT_FLOAT, input_shape); item.feed.push_back(std::make_pair("feed", feed_tensor)); - grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - TrtCandidateSelector selector(graph_properties); - TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node())); - ExpectStatus( - selector.IsTensorRTCandidate(incompatible_matmul.operation.node()), - error::INVALID_ARGUMENT, - "transpose_a is not supported for TensorRT FullyConnected " - "(op: MatMul), at: incompatible_matmul"); - ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()), - error::UNIMPLEMENTED, "Op type Sin is not supported"); - ExpectStatus(selector.IsTensorRTCandidate( - matmul_with_incompatible_input.operation.node()), - error::INTERNAL, - "Failed to convert input with index 0 to a TRT_TensorOrWeights"); + for (const int precision_mode : {FP32MODE, INT8MODE}) { + TrtCandidateSelector selector(graph_properties, precision_mode); + TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node())); + ExpectStatus( + selector.IsTensorRTCandidate(incompatible_matmul.operation.node()), + error::INVALID_ARGUMENT, + "transpose_a is not supported for TensorRT FullyConnected " + "(op: MatMul), at: incompatible_matmul"); + ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()), + error::UNIMPLEMENTED, "Op type Sin is not supported"); + ExpectStatus( + selector.IsTensorRTCandidate( + matmul_with_incompatible_input.operation.node()), + error::INTERNAL, + "Failed to convert input with index 0 to a TRT_TensorOrWeights"); + if (precision_mode == INT8MODE) { + TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node())); + } else { + ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()), + error::UNIMPLEMENTED, + "Op type FakeQuantWithMinMaxArgs is not supported"); + } + } } class FakeCluster : public grappler::Cluster { diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc index 431e3636126137e3af641af027eaaec3f80e525e..631c2575c062811dce86348bd9711d3a36a7c585 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc @@ -54,10 +54,10 @@ limitations under the License. // would work! #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2) -#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ - do { \ - return tensorflow::errors::Internal( \ - "TFTRT::", __FUNCTION__, "failed to add TRT layer, at: ", node); \ +#define TFTRT_INTERNAL_ERROR_AT_NODE(node) \ + do { \ + return tensorflow::errors::Internal( \ + "TFTRT::", __FUNCTION__, " failed to add TRT layer, at: ", node); \ } while (0) #define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \ @@ -187,10 +187,49 @@ Status ValidateTensorProperties(const string& producer_node_type, return Status::OK(); } +string DebugString(const nvinfer1::DimensionType type) { + switch (type) { + case nvinfer1::DimensionType::kSPATIAL: + return "kSPATIAL"; + case nvinfer1::DimensionType::kCHANNEL: + return "kCHANNEL"; + case nvinfer1::DimensionType::kINDEX: + return "kINDEX"; + case nvinfer1::DimensionType::kSEQUENCE: + return "kSEQUENCE"; + default: + return StrCat(static_cast(type), "=unknown"); + } +} + +string DebugString(const nvinfer1::DataType trt_dtype) { + switch (trt_dtype) { + case nvinfer1::DataType::kFLOAT: + return "kFLOAT"; + case nvinfer1::DataType::kHALF: + return "kHALF"; + case nvinfer1::DataType::kINT8: + return "kINT8"; + case nvinfer1::DataType::kINT32: + return "kINT32"; + default: + return "Invalid TRT data type"; + } +} + string DebugString(const nvinfer1::Dims& dims) { string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d="); for (int i = 0; i < dims.nbDims; ++i) { - StrAppend(&out, dims.d[i], ","); + StrAppend(&out, dims.d[i], "[", DebugString(dims.type[i]), "],"); + } + StrAppend(&out, ")"); + return out; +} + +string DebugString(const nvinfer1::Permutation& permutation, int len) { + string out = "nvinfer1::Permutation("; + for (int i = 0; i < len; ++i) { + StrAppend(&out, permutation.order[i], ","); } StrAppend(&out, ")"); return out; @@ -198,16 +237,15 @@ string DebugString(const nvinfer1::Dims& dims) { string DebugString(const nvinfer1::ITensor& tensor) { return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", shape=", DebugString(tensor.getDimensions()), ")"); + ", name=", tensor.getName(), + ", dtype=", DebugString(tensor.getType()), + ", dims=", DebugString(tensor.getDimensions()), ")"); } -// Return whether or not the broadcast is feasible; -bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l, - const bool operand_l_is_tensor, - const nvinfer1::Dims& operand_r, - const bool operand_r_is_tensor, - nvinfer1::Dims* operand_l_new_shape, - nvinfer1::Dims* operand_r_new_shape) { +Status Converter::GetTrtBroadcastShape( + const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims) const { // *************************************************************************** // TensorRT Elementwise op supports broadcast but requires both tensor to be // of Identical rank @@ -232,52 +270,59 @@ bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l, // -> T: 1 1 1 -1 3 5 1 // -> W: 1 1 1 1 3 5 1 // *************************************************************************** - const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; - const size_t element_size = sizeof(operand_l.d[0]); - - // fill in dimensions - int l_s[max_nb_dims]; - std::fill(l_s, l_s + max_nb_dims, 1); - int l_d = operand_l_is_tensor ? operand_l.nbDims + 1 : operand_l.nbDims; - int r_s[max_nb_dims]; - std::fill(r_s, r_s + max_nb_dims, 1); - int r_d = operand_r_is_tensor ? operand_r.nbDims + 1 : operand_r.nbDims; - - int max_d = std::max(l_d, r_d); - std::memcpy(l_s + max_d - operand_l.nbDims, operand_l.d, - operand_l.nbDims * element_size); - std::memcpy(r_s + max_d - operand_r.nbDims, operand_r.d, - operand_r.nbDims * element_size); - - // set -1 for batch dimension, since batch size is not supposed to be - // broadcasted - if (operand_l_is_tensor) { - if (max_d != l_d) { // if broadcast beyond batch dimension, fail - return false; - } - l_s[0] = -1; - } - if (operand_r_is_tensor) { - if (max_d != r_d) { // if broadcast beyond batch dimension, fail - return false; - } - r_s[0] = -1; + if (!operand_l.is_tensor() && !operand_r.is_tensor()) { + return errors::InvalidArgument( + "Broadcasting requires at least one of the operands be tensors"); } - // compare broadcast feasibility - for (int i = max_d - 1; i >= 0; i--) { - if ((l_s[i] != r_s[i]) && (l_s[i] != 1) && (r_s[i] != 1)) { - return false; + const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1; + auto compute_output_dims = + [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims, + int* output_dims_array, nvinfer1::Dims* output_dims) { + const nvinfer1::Dims input_dims = input.GetTrtDims(); + std::fill(output_dims_array, output_dims_array + max_nb_dims, 1); + std::copy(input_dims.d, input_dims.d + input_dims.nbDims, + output_dims_array + broadcast_num_dims - input_dims.nbDims); + if (input.is_tensor()) { + const int true_input_dims = input_dims.nbDims + 1; + if (true_input_dims < broadcast_num_dims) { + return errors::InvalidArgument( + "Broadcasting beyond batch dimension is not supported ", + "(tensor #dims ", true_input_dims, " vs broadcast #dims ", + broadcast_num_dims, ")"); + } + // Set the batch dimension to -1, since batch size is not supposed to + // be broadcasted. + output_dims_array[0] = -1; + } + // Copy to output dimensions (stripping the batch dimension). + output_dims->nbDims = broadcast_num_dims - 1; + std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims, + output_dims->d); + return Status::OK(); + }; + + // Compute the output dimensions. + const int broadcast_num_dims = + std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0), + operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 1 : 0)); + int output_l[max_nb_dims], output_r[max_nb_dims]; + TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims, + output_l, operand_l_new_dims)); + TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims, + output_r, operand_r_new_dims)); + + // Compare broadcast feasibility + for (int i = 0; i < broadcast_num_dims; ++i) { + if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && + (output_r[i] != 1)) { + return errors::InvalidArgument( + "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ", + DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", output_r[0], + ", ", DebugString(*operand_r_new_dims), ")"); } } - - // output new TensorRT Dimension (stripping the batch dimension) - operand_l_new_shape->nbDims = max_d - 1; - std::memcpy(operand_l_new_shape->d, l_s + 1, (max_d - 1) * element_size); - operand_r_new_shape->nbDims = max_d - 1; - std::memcpy(operand_r_new_shape->d, r_s + 1, (max_d - 1) * element_size); - - return true; + return Status::OK(); } inline bool DimsEqual(const nvinfer1::Dims& dim_l, @@ -381,7 +426,7 @@ size_t TRT_ShapedWeights::size_bytes() const { string TRT_ShapedWeights::DebugString() const { return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", type_, + ", type=", DataTypeString(type_), ", values=", reinterpret_cast(GetValues()), ")"); } @@ -425,7 +470,9 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor { void setLocation(nvinfer1::TensorLocation location) override {} #if NV_TENSORRT_MAJOR >= 5 - bool setDynamicRange(float min, float max) override {} + bool setDynamicRange(float min, float max) override { return true; } + + float getDynamicRange() const override { return 0; } #endif private: @@ -489,8 +536,7 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { - StrAppend(&output, "tensor @", reinterpret_cast(tensor()), - ", shape=", convert::DebugString(tensor()->getDimensions()), + StrAppend(&output, "tensor=", convert::DebugString(*tensor()), ", batch_size=", batch_size_); } else { StrAppend(&output, "weights=", weights_.DebugString()); @@ -753,8 +799,9 @@ Status TrtNodeValidator::ValidateNode( Status status = ConvertToTensorOrWeights( *pair.first, pair.second, graph_properties, &tensor_or_weights); if (!status.ok()) { - return errors::Internal("Failed to convert input with index ", i, - " to a TRT_TensorOrWeights"); + return errors::Internal( + "Failed to convert input with index ", i, + " to a TRT_TensorOrWeights: ", status.error_message()); } inputs.push_back(tensor_or_weights); } @@ -786,8 +833,11 @@ Status TrtNodeValidator::ConvertConstToWeights( return status; } -Converter::Converter(nvinfer1::INetworkDefinition* trt_network, bool is_fp16) - : trt_network_(trt_network), is_fp16_(is_fp16) { +Converter::Converter(nvinfer1::INetworkDefinition* trt_network, + int precision_mode, bool use_calibration) + : trt_network_(trt_network), + precision_mode_(precision_mode), + use_calibration_(use_calibration) { this->RegisterOpConverters(); } @@ -812,13 +862,18 @@ Status Converter::ConvertNode(const NodeDef& node_def) { TRT_TensorOrWeights& output = outputs[i]; string output_name = node_def.name(); if (i != 0) output_name = StrCat(output_name, ":", i); - // We need to check the name before setting it. For Identity op where the - // output is the input, if its input is one of the engine input, setting - // the name here will overwrite engine input bindings which will cause - // runtime error. + // We need to check the name before setting it. If the input is one of the + // engine input, setting the name here will overwrite engine input + // bindings which will cause runtime error. if (output.is_tensor()) { const char* tensor_name = output.tensor()->getName(); - if (tensor_name == nullptr || std::strlen(tensor_name) == 0) { + if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) { + // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename + // them to match their corresponding TensorFlow name. + // Note: ITensors that we create internally within TF-TRT which are + // not inputs or outputs of a node will not be renamed. This is a + // potential cause of confusion if an error message or warning + // mentions the unnamed tensor. output.tensor()->setName(output_name.c_str()); } } @@ -930,11 +985,14 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose"); + MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0)); nvinfer1::Permutation permutation; for (int32_t i = 0; i < dims.nbDims; ++i) { permutation.order[i] = order_with_batch_dim[i + 1] - 1; } + VLOG(1) << "TransposeTensor permutation: " + << DebugString(permutation, dims.nbDims); layer->setFirstTranspose(permutation); nvinfer1::Dims reshape_dims; @@ -950,6 +1008,38 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, return tensorflow::Status::OK(); } +Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, + float* out_min, float* out_max) const { + switch (weights.type_) { + case DataType::DT_FLOAT: { + auto inp = static_cast(weights.GetValues()); + auto result = std::minmax_element(inp, inp + weights.count()); + *out_min = *result.first; + *out_max = *result.second; + break; + } + case DataType::DT_HALF: { + auto inp = static_cast(weights.GetValues()); + auto result = std::minmax_element(inp, inp + weights.count()); + *out_min = Eigen::half_impl::half_to_float(*result.first); + *out_max = Eigen::half_impl::half_to_float(*result.second); + break; + } + case DataType::DT_INT32: { + auto inp = static_cast(weights.GetValues()); + auto result = std::minmax_element(inp, inp + weights.count()); + *out_min = static_cast(*result.first); + *out_max = static_cast(*result.second); + break; + } + default: + return errors::Unimplemented( + "Data type not supported for GetWeightRange: ", + DataTypeString(weights.type_)); + } + return Status::OK(); +} + Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, const nvinfer1::Dims& dims, const nvinfer1::ITensor** tensor) { @@ -964,8 +1054,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, } if (can_check_shapes && TrtDimsNumElements(input.GetTrtDims()) != TrtDimsNumElements(dims)) { - return tensorflow::errors::InvalidArgument( - "Reshape shapes are not compatible."); + return errors::InvalidArgument("Reshape shapes are not compatible (", + DebugString(input.GetTrtDims()), " vs ", + DebugString(dims), ")"); } if (input.is_tensor()) { @@ -976,6 +1067,8 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, *const_cast(input.tensor())); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); layer->setReshapeDimensions(dims); + MarkQuantizationRangesAsInferrable( + const_cast(input.tensor()), layer->getOutput(0)); *tensor = layer->getOutput(0); } } else { @@ -983,10 +1076,123 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, this->network()->addConstant(dims, input.weights().GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); *tensor = layer->getOutput(0); + if (precision_mode() == INT8MODE && !use_calibration()) { + // If we are in int8 mode and not calibrating, we need to explicitly set a + // quantization range for the output tensor of the IConstantLayer. Here we + // set the range to [min(weights), max(weights)]. + float min_range = 0.0f; + float max_range = 0.0f; + TF_RETURN_IF_ERROR( + GetWeightRange(input.weights(), &min_range, &max_range)); + // Avoid setting range to 0 because TRT will throw an error. If the + // weights are zero then the range doesn't matter: using 127.0f should + // ensure the quantized weight will be exactly zero. + if (min_range == 0.0f && max_range == 0.0f) { + min_range = -127.0f; + max_range = 127.0f; + } + ProvideQuantizationRange(const_cast(*tensor), + min_range, max_range); + } } return tensorflow::Status::OK(); } +void Converter::MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input, + nvinfer1::ITensor* output) { + quantization_infer_.push_back({input, output}); + quantization_infer_.push_back({output, input}); +} + +void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor, + float min_range, float max_range) { + float symmetric_range = std::max(std::abs(min_range), std::abs(max_range)); + quantization_ranges_[tensor] = symmetric_range; +} + +void Converter::MaybeApplyQuantizationRanges() { + if (precision_mode() != INT8MODE) return; + + // Infer ranges across marked ops. + PropagateQuantizationRanges(); + // Apply ranges. +#if NV_TENSORRT_MAJOR >= 5 + for (auto pair : quantization_ranges_) { + nvinfer1::ITensor* tensor = pair.first; + const float range = pair.second; + VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range; + // TODO(laigd): if 'tensor' already has a range set which doesn't match + // 'range', it should report error. + tensor->setDynamicRange(-range, range); + } +#endif + + // Warn user about tensors that are missing ranges. If TRT fuses some layers + // then these tensors may not actually be required, which is why this is + // just a warning. If we are still missing ranges even after fusion, + // Builder::buildCudaEngine() will return nullptr and we will catch the + // error at that point. + if (!use_calibration()) { + // Get all tensors from network + std::set all_tensors; + for (int i = 0; i < this->network()->getNbLayers(); i++) { + nvinfer1::ILayer* layer = this->network()->getLayer(i); + for (int j = 0; j < layer->getNbInputs(); j++) { + all_tensors.insert(layer->getInput(j)); + } + for (int j = 0; j < layer->getNbOutputs(); j++) { + all_tensors.insert(layer->getOutput(j)); + } + } + // Find tensors with no ranges + for (auto tensor : all_tensors) { + if (!quantization_ranges_.count(tensor)) { + // Note: there may be some warnings for "(Unnamed ITensor* N)". These + // are tensors which are created internally by TF-TRT. The ranges for + // these unnamed ITensors are always inferred from user provided ranges, + // thus there will also be a warning for the range(s) the user missed. + LOG(WARNING) << "Quantization range was not found for " + << tensor->getName() << ". " + << "This is okay if TensorRT does not need the range " + << "(e.g. due to node fusion)."; + } + } + } +} + +void Converter::PropagateQuantizationRanges() { + // Propagate ranges across edges in quantization_infer_ until no new + // information is added. + // Note: this function modifies quantization_infer_, it might be better to + // modify a copy instead if we for some reason need quantization_infer_ + // later. + bool information_added = true; + while (information_added) { + information_added = false; + for (auto it = quantization_infer_.begin(); + it != quantization_infer_.end();) { + auto input_tensor_range = quantization_ranges_.find(it->first); + auto output_tensor_range = quantization_ranges_.find(it->second); + if (input_tensor_range != quantization_ranges_.end() && + output_tensor_range == quantization_ranges_.end()) { + // Input has range but output doesn't: copy range + // TODO(laigd): consider reporting error if it a different range is + // already set. + quantization_ranges_[it->second] = input_tensor_range->second; + information_added = true; + VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> " + << it->second->getName(); + } + // We can remove edges when the output range is known + if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) { + it = quantization_infer_.erase(it); + } else { + ++it; + } + } + } +} + Status Converter::GetInputs(const tensorflow::NodeDef& node_def, std::vector* inputs) const { for (auto const& input_name : node_def.input()) { @@ -1043,12 +1249,11 @@ TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store, } // **************************************************************************** -// Constant folding functions -// TODO(jie): once optimizer kicks in, we should have done constant folding -// there. +// Constant folding functions for weights. +// TODO(laigd): we should probably use eigen directly. // ***************************************************************************** struct LambdaFactory { - enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB, RECIP }; + enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP }; OP_CATEGORY op; template @@ -1063,7 +1268,7 @@ struct LambdaFactory { case OP_CATEGORY::RECIP: return [](T t) -> T { return 1.0 / t; }; default: - VLOG(2) << "Not supported op for unary: " << static_cast(op); + LOG(ERROR) << "Not supported op for unary: " << static_cast(op); return nullptr; } } @@ -1074,15 +1279,18 @@ std::function LambdaFactory::unary() { switch (op) { case OP_CATEGORY::RSQRT: { VLOG(2) << "RSQRT GETS DONE"; - return [](Eigen::half t) -> Eigen::half { + return [](Eigen::half t) { return Eigen::half(1.0 / sqrt(static_cast(t))); }; } case OP_CATEGORY::NEG: - return [](Eigen::half t) -> Eigen::half { return -t; }; - // TODO(aaroey): can we support RECIP? + return [](Eigen::half t) { return -t; }; + case OP_CATEGORY::RECIP: + return [](Eigen::half t) { + return Eigen::half(1.0 / static_cast(t)); + }; default: - VLOG(2) << "Not supported op for unary: " << static_cast(op); + LOG(ERROR) << "Not supported op for unary: " << static_cast(op); return nullptr; } } @@ -1114,50 +1322,48 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights, return tensorflow::Status::OK(); } +// If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the +// right operand. If swapped_inputs is true, those two are swapped. +// // TODO(jie): broadcast is needed yet not implemented. -// Only implemented channel wise for the time being -tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, - const nvinfer1::ITensor* tensor, - TRT_ShapedWeights weights, - bool swapped_inputs) { +// Only implemented channel wise for the time being. +Status BinaryTensorOpWeight(OpConverterParams* params, + const nvinfer1::ITensor* tensor, + TRT_ShapedWeights weights, bool swapped_inputs) { + static const std::unordered_set supported_ops = {"Sub", "Add", "Mul", + "Div", "RealDiv"}; const auto& node_def = params->node_def; - // tensor is the left operand while weights is the right operand; - // when swapped_inputs set to true, those two are swapped. - // TODO(aaroey): use a set. - if (node_def.op() != "Sub" && node_def.op() != "Add" && - node_def.op() != "Mul" && node_def.op() != "Div" && - node_def.op() != "RealDiv") { - return tensorflow::errors::Unimplemented( - "op not supported: " + node_def.op() + ", at: " + node_def.name()); + if (!supported_ops.count(node_def.op())) { + return errors::Unimplemented(node_def.op(), " is not supported, at ", + node_def.name()); } - // Check type consistency - nvinfer1::DataType ttype; - TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype)); + // Check type consistency. + nvinfer1::DataType trt_dtype; + TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &trt_dtype)); - // Check scale mode + // Check scale mode. auto dims_w = weights.shape_; - auto dims_t = tensor->getDimensions(); + const auto dims_t = tensor->getDimensions(); // TODO(jie): addScale checks for input tensor dimension if (dims_t.nbDims != 3) { - return tensorflow::errors::InvalidArgument( - "addScale requires tensor with rank 3, " + node_def.name()); + return errors::InvalidArgument("addScale requires tensor with rank 3, at ", + node_def.name()); } - // default to element-wise + // Default to element-wise auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; // TODO(jie): maybe use a permutation instead to support more cases; - bool permutation_flag = false; + bool need_to_permute = false; if (weights.count() == 1) { - VLOG(2) << "UNIFORM"; scale_mode = nvinfer1::ScaleMode::kUNIFORM; } else { - // no broadcasting on Batch dimension; - VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims - << " tensor DIM: " << dims_t.nbDims; + VLOG(2) << "weights dims: " << DebugString(dims_w) + << "; tensor dims: " << DebugString(dims_t); + // Make sure no broadcasting on batch dimension. if (dims_w.nbDims == dims_t.nbDims + 1) { if (dims_w.d[0] == 1) { for (int i = 1; i < dims_w.nbDims; i++) { @@ -1165,72 +1371,70 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, } dims_w.nbDims--; } else { - return tensorflow::errors::InvalidArgument( - "Binary op cannot operate on batch, " + node_def.name()); + return errors::InvalidArgument("Binary op cannot operate on batch, at ", + node_def.name()); } } if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) { scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - // default is element; + // Default is element-wise for (int i = 1; i < dims_w.nbDims; i++) { if (dims_w.d[i] != dims_t.d[i]) { - // if dimension does not match, switch back to channel; - VLOG(2) << "channel"; + // If dimension does not match, switch back to per-channel scale_mode = nvinfer1::ScaleMode::kCHANNEL; break; } } - // if channel as candidate, validate it + // If the mode is per-channel, since channel dimension is assumed to be + // the third to last dimension, we need to make sure all other dimensions + // have size 1. if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { for (int i = 1; i < dims_w.nbDims; i++) { if (dims_w.d[i] != 1) - return tensorflow::errors::InvalidArgument( - "Weight shape not compatible at, " + node_def.name()); + return errors::InvalidArgument( + "Weight dims not compatible for channel-wise broadcast at ", + node_def.name()); } - } else { - VLOG(2) << "elementwise"; } } else if (dims_w.nbDims == 1 && dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) { - // channel wise and broadcast required; - permutation_flag = true; + // Channel wise and broadcast required. We compare the last dimension of + // the tensor shape because of tensorflow default broadcasting rules. + need_to_permute = true; scale_mode = nvinfer1::ScaleMode::kCHANNEL; } else { - return tensorflow::errors::InvalidArgument( - "Weight shape not compatible at, " + node_def.name()); + return errors::InvalidArgument("Weight dims not compatible at ", + node_def.name()); } } + // TODO(laigd): we should add validation_only support in TransposeTensor() and + // PrepareTensorForShape(). + if (params->validation_only) return Status::OK(); - // transpose last dimension + // Transpose last dimension. std::vector permutation(dims_t.nbDims + 1); - if (permutation_flag) { - if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) { - // we swap the last dimension into channel for trt. - // because of tensorflow default broadcasting rules. - for (int i = 0; i < static_cast(permutation.size()); i++) { - permutation[i] = i; - } - permutation[1] = dims_t.nbDims; - permutation[dims_t.nbDims] = 1; - TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - const_cast(tensor), permutation, &tensor)); - } else { - return tensorflow::errors::InvalidArgument( - "Transpose cannot be applied, " + node_def.name()); + if (need_to_permute) { + // We swap the last dimension into channel for trt, because of tensorflow + // default broadcasting rules. + for (int i = 0; i < static_cast(permutation.size()); i++) { + permutation[i] = i; } + permutation[1] = dims_t.nbDims; + permutation[dims_t.nbDims] = 1; + TF_RETURN_IF_ERROR(params->converter->TransposeTensor( + const_cast(tensor), permutation, &tensor)); } - if (params->converter->is_fp16()) { + if (params->converter->precision_mode() == FP16MODE) { weights = ConvertFP32ToFP16(params->weight_store, weights); } - // prepare weights + // Prepare weights TRT_ShapedWeights shift_weights(weights.type_); TRT_ShapedWeights scale_weights(weights.type_); TRT_ShapedWeights power_weights(weights.type_); - // Maybe I should do a switch if (node_def.op() == "Sub") { if (swapped_inputs) { shift_weights = weights; @@ -1238,6 +1442,10 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, *const_cast(tensor), nvinfer1::UnaryOperation::kNEG); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + // Since quantization ranges are symmetric, the same range as the input + // will work for the negation of the input. + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), layer->getOutput(0)); tensor = layer->getOutput(0); } else { TRT_ShapedWeights neg_weights = @@ -1249,6 +1457,25 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, } } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") { if (swapped_inputs) { + // We need to infer the quantization range for this intermediate tensor. + // + // x -> [Recip] -> 1/x -> [Scale] -> s/x + // ^ + // need range for this + // + // We have the quantization scales for x and s/x - can we divide the scale + // for s/x by s? Only if it is a scalar. + // + // Because of this issue, fall back to BinaryTensorOpTensor if we are + // doing INT8 with no calibration. There is most likely no performance + // penalty by falling back here. + if (params->converter->precision_mode() == INT8MODE && + !params->converter->use_calibration()) { + return errors::Unimplemented( + "Intermediate quantization range cannot be determined without" + " calibration. Falling back to BinaryTensorOpTensor for ", + node_def.op(), ", at ", node_def.name()); + } scale_weights = weights; nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary( *const_cast(tensor), @@ -1268,8 +1495,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, } else if (node_def.op() == "Add") { shift_weights = weights; } else { - return tensorflow::errors::Unimplemented("Binary op not supported: " + - node_def.op()); + // This should not happen. + return errors::Unimplemented("Binary op not supported at ", node_def.op()); } nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( @@ -1279,8 +1506,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params, TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); const nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // transpose back dimension - if (permutation_flag) { + // Transpose back dimension + if (need_to_permute) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( const_cast(output_tensor), permutation, &output_tensor)); @@ -1324,7 +1551,7 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) { return tensorflow::errors::Internal( "Conv2D expects kernel of dimension 4, at: " + node_def.name()); } - if (params->converter->is_fp16()) { + if (params->converter->precision_mode() == FP16MODE) { weights_rsck = ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights()); } @@ -1371,6 +1598,8 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) { nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), pad_layer->getOutput(0)); padding = {{0, 0}, {0, 0}}; tensor = pad_layer->getOutput(0); VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions()); @@ -1412,9 +1641,9 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, params->node_def.name()); } -tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params, - const TRT_TensorOrWeights& operand_l, - const TRT_TensorOrWeights& operand_r) { +Status BinaryTensorOpTensor(OpConverterParams* params, + const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r) { const auto& node_def = params->node_def; static const std::unordered_map ops{ {"Add", nvinfer1::ElementWiseOperation::kSUM}, @@ -1425,50 +1654,52 @@ tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params, {"Minimum", nvinfer1::ElementWiseOperation::kMIN}, {"Maximum", nvinfer1::ElementWiseOperation::kMAX}, }; + auto op_pair = ops.find(node_def.op()); + if (op_pair == ops.end()) { + return errors::Unimplemented("Binary op ", node_def.op(), + " not supported at: ", node_def.name()); + } - const nvinfer1::ITensor* tensor_l; - const nvinfer1::ITensor* tensor_r; - - nvinfer1::Dims dim_l; - nvinfer1::Dims dim_r; - - if (!TensorRTGetBroadcastShape(operand_l.GetTrtDims(), operand_l.is_tensor(), - operand_r.GetTrtDims(), operand_r.is_tensor(), - &dim_l, &dim_r)) { - return tensorflow::errors::InvalidArgument( - "Binary op broadcast scheme not supported by TensorRT op: " + - node_def.op() + ", at: " + node_def.name()); + nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r; + Status status = params->converter->GetTrtBroadcastShape( + operand_l, operand_r, &broadcasted_dims_l, &broadcasted_dims_r); + if (!status.ok()) { + return errors::InvalidArgument( + "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ", + status.error_message()); } + if (params->validation_only) return Status::OK(); - TF_RETURN_IF_ERROR( - params->converter->PrepareTensorForShape(operand_l, dim_l, &tensor_l)); - TF_RETURN_IF_ERROR( - params->converter->PrepareTensorForShape(operand_r, dim_r, &tensor_r)); + const nvinfer1::ITensor* tensor_l = nullptr; + const nvinfer1::ITensor* tensor_r = nullptr; + status = params->converter->PrepareTensorForShape( + operand_l, broadcasted_dims_l, &tensor_l); + if (status.ok()) { + status = params->converter->PrepareTensorForShape( + operand_r, broadcasted_dims_r, &tensor_r); + } + if (!status.ok()) { + return errors::Internal("Failed to convert binary op ", node_def.name(), + ": ", status.error_message()); + } - // get trt type & shape + // Check type consistency. TFAttrs attrs(node_def); - // maybe this part has to be moved into the block of rsqrt later nvinfer1::DataType dtype = attrs.get("T"); + TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype) + << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype); + TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype) + << DebugString(tensor_r->getType()) << " vs " << DebugString(dtype); - // check type consistency - TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype); - TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype); - auto op_pair = ops.find(node_def.op()); - if (op_pair == ops.end()) { - return tensorflow::errors::Unimplemented( - "binary op: ", node_def.op(), " not supported at: ", node_def.name()); - } - + // Add ElementWise layer. nvinfer1::IElementWiseLayer* layer = params->converter->network()->addElementWise( - // TODO(aaroey): will tensor_l/tensor_r get modified? *const_cast(tensor_l), *const_cast(tensor_r), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // pass the output + // Pass the output params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); } @@ -1715,6 +1946,8 @@ tensorflow::Status ConvertPool(OpConverterParams* params) { nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), pad_layer->getOutput(0)); padding = {{0, 0}, {0, 0}}; tensor = pad_layer->getOutput(0); } @@ -1722,6 +1955,11 @@ tensorflow::Status ConvertPool(OpConverterParams* params) { nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling( *const_cast(tensor), type, ksize); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + // TODO(tmorris): Average pooling may not be entirely safe to infer + // quantization range through (at least forwards - backwards should be fine). + // Max pooling is okay. + params->converter->MarkQuantizationRangesAsInferrable( + const_cast(tensor), layer->getOutput(0)); layer->setStride(stride); layer->setPadding({padding[0].first, padding[1].first}); @@ -1750,99 +1988,249 @@ tensorflow::Status ConvertActivation(OpConverterParams* params) { return tensorflow::Status::OK(); } -tensorflow::Status ConvertScale(OpConverterParams* params) { +Status ConvertQuantize(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; - if (inputs.size() != 2 || !inputs.at(0).is_tensor() || - !inputs.at(1).is_weights()) { + if ((inputs.size() == 0) || + (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) || + (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) || + (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) || + (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) { + return errors::InvalidArgument("Invalid number of inputs for ", + node_def.op(), ", at ", node_def.name()); + } + if (inputs.at(0).is_weights()) { + // TensorRT will automatically quantize weights, so we will ignore ranges + // for weights. + params->outputs->push_back(inputs.at(0)); + return Status::OK(); + } + float min_range = 0.0f; + float max_range = 0.0f; + if (node_def.op() == "FakeQuantWithMinMaxArgs") { + // Get ranges via node attributes. + TFAttrs attrs(node_def); + if (attrs.count("min") == 0 || attrs.count("max") == 0) { + return errors::InvalidArgument("Min or max attribute not found for ", + node_def.op(), " at ", node_def.name()); + } + min_range = attrs.get("min"); + max_range = attrs.get("max"); + } else if (node_def.op() == "FakeQuantWithMinMaxVars" || + node_def.op() == "QuantizeAndDequantizeV2" || + node_def.op() == "QuantizeAndDequantizeV3") { + // Get ranges via inputs. + if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) { + return errors::InvalidArgument("Min and max inputs for ", node_def.op(), + " must be weights not tensors, at ", + node_def.name()); + } + auto get_weights_value = [&inputs](int index) { + auto raw_weights = static_cast( + const_cast(inputs.at(index).weights().GetValues())); + return raw_weights[0]; + }; + min_range = get_weights_value(1); + max_range = get_weights_value(2); + } else { + return errors::InvalidArgument("Unknown quantization op ", node_def.op(), + ", at ", node_def.name()); + } + if (params->validation_only) return Status::OK(); + + // Store ranges for tensor + params->converter->ProvideQuantizationRange( + const_cast(inputs.at(0).tensor()), min_range, + max_range); + // Sometimes, TRT may not quantize a tensor, either because it chooses to + // execute a higher precision kernel or because of op fusion. In these cases, + // accuracy will suffer if the model was trained to expect quantization at + // that tensor. We should consider adding a clip(tensor, min_range, max_range) + // operation here to ensure that any arbitrarily placed quantize node will + // execute as expected. However, this will negatively affect performance. If + // users train their models in a way which models inference as close as + // possible (i.e. not quantizing in place where fusion will occur), then there + // is no problem with the current implementation. + params->outputs->push_back(inputs.at(0)); + return Status::OK(); +} + +// TODO(pdavoodi): we should update relu6 implementation once TensorRT supports +// Relu6 natively. +tensorflow::Status ConvertRelu6(OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + if (inputs.size() != 1) { + return tensorflow::errors::InvalidArgument( + "Invalid number of inputs for Relu6, at ", node_def.name()); + } + if (inputs.at(0).is_weights()) { return tensorflow::errors::Unimplemented( - "ConvertScale only supports tensorweight: ", node_def.name()); + "Relu6 is only implemented for tensors, not weights, at ", + node_def.name()); } + if (params->validation_only) return Status::OK(); + // *************************************************************************** + // TensorRT does not implement Relu6 natively. This function converts Relu6 op + // to available TensorRT ops: Relu6(x) = min(Relu(x), 6) + // *************************************************************************** + // Input Tensor const nvinfer1::ITensor* tensor = inputs.at(0).tensor(); - TRT_ShapedWeights weights = inputs.at(1).weights(); - if (params->converter->is_fp16()) { - weights = ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights()); - } - TRT_ShapedWeights empty_weights(weights.type_); - TFAttrs attrs(node_def); + // Relu operation i.e. Relu(x) = max(0, x) + nvinfer1::IActivationLayer* relu_layer = + params->converter->network()->addActivation( + *const_cast(tensor), + nvinfer1::ActivationType::kRELU); + TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name()); + + // Large range of relu is problematic during quantization in INT8 precision + // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization. + // TRT only uses dynamic ranges in INT8 precision mode, + // and this does not affect the FP32 path. + params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f, + 6.0f); + + // Create a constant layer to store the floating point weight i.e. 6.0f This + // tensor will be broadcasted uniformly during elementwise `min` operation. + // The constant has to have the same rank as the input in order for TRT to + // broadcast + nvinfer1::Dims dims; + dims.nbDims = relu_layer->getOutput(0)->getDimensions().nbDims; + for (int i = 0; i < dims.nbDims; i++) { + dims.d[i] = 1; + } + TRT_ShapedWeights weights = params->weight_store->GetTempWeights( + tensorflow::DataType::DT_FLOAT, dims); + auto weights_ptr = + static_cast(const_cast(weights.GetValues())); + weights_ptr[0] = 6.0f; + nvinfer1::IConstantLayer* const6_layer = + params->converter->network()->addConstant(dims, weights.GetTrtWeights()); + TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name()); + params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f, + 6.0f); + + // ElementWise Min Operation + // Min op is a nop for INT8 execution path, as the input tensor + // to this layer will only have values in range [0.f, 6.0f]. + const nvinfer1::ITensor* tensor_l = relu_layer->getOutput(0); + const nvinfer1::ITensor* tensor_r = const6_layer->getOutput(0); + nvinfer1::IElementWiseLayer* relu6_layer = + params->converter->network()->addElementWise( + *const_cast(tensor_l), + *const_cast(tensor_r), + nvinfer1::ElementWiseOperation::kMIN); + TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name()); + nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0); + params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); - const auto data_format = attrs.get("data_format"); - int channel_index; - const auto dims = tensor->getDimensions(); - if (data_format == "NHWC") { - // 1). NHWC is really N+C - channel_index = dims.nbDims - 1; // batch dimension is implicit here! - } else { - // 2). NCHW is really N+CHW - channel_index = 0; // batch dimension is implicit here! - } + params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); + return Status::OK(); +} - nvinfer1::Permutation permutation; - for (int32_t i = 0; i < dims.nbDims; ++i) { - permutation.order[i] = i; +tensorflow::Status ConvertBiasAdd(OpConverterParams* params) { + const auto& inputs = params->inputs; + const auto& node_def = params->node_def; + if (inputs.size() != 2 || !inputs.at(0).is_tensor() || + !inputs.at(1).is_weights()) { + return errors::InvalidArgument("Input expects tensor and weights, at ", + node_def.name()); } + if (params->validation_only) return Status::OK(); + + nvinfer1::ITensor* tensor = + const_cast(inputs.at(0).tensor()); + const nvinfer1::Dims original_dims = tensor->getDimensions(); + TFAttrs attrs(node_def); + const string data_format = attrs.get("data_format"); + const int channel_index = + (data_format == "NHWC" ? original_dims.nbDims - 1 : 0); - if (channel_index >= 0) { + nvinfer1::Permutation permutation; + if (channel_index != 0) { + // Permute the dimensions so that the channel dimension is the first + // dimension. + for (int i = 0; i < original_dims.nbDims; ++i) { + permutation.order[i] = i; + } permutation.order[0] = channel_index; permutation.order[channel_index] = 0; - } else { - return tensorflow::errors::Unimplemented( - "TFTRT::BiasAdd cannot apply on batch dimension, at ", node_def.name()); + VLOG(1) << "ConvertBiasAdd permutation: " + << DebugString(permutation, original_dims.nbDims); } // TensorRT addScale requires input to be of rank 3, we need to apply - // transpose as well as reshape - if (channel_index != 0 || dims.nbDims != 3) { + // transpose as well as reshape. + // TODO(laigd): this doesn't match what the TRT doc says, fix the doc? + if (channel_index != 0 || original_dims.nbDims != 3) { nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle( - *const_cast(tensor)); + params->converter->network()->addShuffle(*tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); + params->converter->MarkQuantizationRangesAsInferrable( + tensor, shuffle_layer->getOutput(0)); + + // NOTE(laigd): for some reason we need to apply the reshape + // unconditionally. The default shape has nbDims==-1 and it seems the + // behavior is undefined in some cases. nvinfer1::Dims reshape_dims; reshape_dims.nbDims = 3; - reshape_dims.d[0] = 0; // 0 copy from the input - reshape_dims.d[1] = dims.nbDims >= 2 ? 0 : 1; // 0 copy from the input - reshape_dims.d[2] = dims.nbDims >= 3 ? -1 : 1; // -1 infer from the rest + // 0 means copying from input; -1 means inferring from the rest. + reshape_dims.d[0] = 0; + reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1; + reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1; + shuffle_layer->setReshapeDimensions(reshape_dims); + if (channel_index != 0) { - // maybe we do not need this check. concerned about TRT optimization shuffle_layer->setFirstTranspose(permutation); } - shuffle_layer->setReshapeDimensions(reshape_dims); tensor = shuffle_layer->getOutput(0); } + TRT_ShapedWeights weights = inputs.at(1).weights(); + if (params->converter->precision_mode() == FP16MODE) { + weights = ConvertFP32ToFP16(params->weight_store, weights); + } nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL; if (weights.shape_.d[0] == 1) { mode = nvinfer1::ScaleMode::kUNIFORM; } + TRT_ShapedWeights empty_weights(weights.type_); nvinfer1::IScaleLayer* layer = params->converter->network()->addScale( - *const_cast(tensor), mode, weights.GetTrtWeights(), - empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights()); + *tensor, mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(), + empty_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); nvinfer1::ITensor* output_tensor = layer->getOutput(0); - // restore transpose & reshape - if (channel_index != 0 || dims.nbDims != 3) { + // Restore transpose & reshape. + if (channel_index != 0 || original_dims.nbDims != 3) { nvinfer1::IShuffleLayer* shuffle_layer = - params->converter->network()->addShuffle( - *const_cast(output_tensor)); + params->converter->network()->addShuffle(*output_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); - nvinfer1::Dims reshape_dims = dims; - int tmp = reshape_dims.d[channel_index]; - reshape_dims.d[channel_index] = reshape_dims.d[0]; - reshape_dims.d[0] = tmp; + // NOTE: for same reason as mentioned above we need to apply the reshape + // unconditionally. + nvinfer1::Dims reshape_dims = original_dims; + if (channel_index != 0) { + // NOTE: according to NVIDIA dimension types are deprecated, so we don't + // need to copy them back. + reshape_dims.d[channel_index] = original_dims.d[0]; + reshape_dims.d[0] = original_dims.d[channel_index]; + } shuffle_layer->setReshapeDimensions(reshape_dims); + if (channel_index != 0) { shuffle_layer->setSecondTranspose(permutation); } + params->converter->MarkQuantizationRangesAsInferrable( + output_tensor, shuffle_layer->getOutput(0)); output_tensor = shuffle_layer->getOutput(0); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); - return tensorflow::Status::OK(); + return Status::OK(); } Status GetTensorDimsWithProtoShape(const Tensor& tensor, @@ -1996,32 +2384,41 @@ tensorflow::Status ConvertConst(OpConverterParams* params) { } tensorflow::Status ConvertIdentity(OpConverterParams* params) { + // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT + // 5.0, however once we know that it does it would be nice to use that + // instead. params->outputs->push_back(params->inputs.at(0)); return tensorflow::Status::OK(); } -tensorflow::Status ConvertBinary(OpConverterParams* params) { +Status ConvertBinary(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; if (inputs.size() != 2) { - return tensorflow::errors::FailedPrecondition( - "Binary ops require two tensor input, at ", node_def.name()); + return errors::InvalidArgument("Binary ops require two inputs, at ", + node_def.name()); } // Constant folding should have been done by TensorFlow - if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) { - return tensorflow::errors::Unimplemented( + return errors::Unimplemented( "Constant folding is falled back to TensorFlow, binary op received " "both input as constant at: ", node_def.name()); } - // Try to convert into Scale layer first (for better performance) + // TODO(tmorris): TRT plans to deprecate IScaleLayer and will replace it with + // IElementwiseLayer. At that point, we can remove BinaryTensorOpWeight. For + // now, the performance will be slightly better with IScaleLayer because it + // can be fused in more situations. However, most of the benefits of + // IScaleLayer are when the layer performs both a shift and a scale, which we + // don't do except for convolutions. + // + // Try to convert into Scale layer first (for better performance). // Since scale layer supports restricted broadcast policy and op types, we // allow failure and try to handle it through Elementwise op - // (BinaryTensorOpTensor) - Status status = tensorflow::Status::OK(); + // (BinaryTensorOpTensor). + Status status = Status::OK(); if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) { status = BinaryTensorOpWeight(params, inputs.at(0).tensor(), inputs.at(1).weights(), false); @@ -2029,7 +2426,10 @@ tensorflow::Status ConvertBinary(OpConverterParams* params) { status = BinaryTensorOpWeight(params, inputs.at(1).tensor(), inputs.at(0).weights(), true); } + // If both input are tensors, or one of them is weights but the conversion + // above failed, try the conversion using BinaryTensorOpTensor. if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) { + if (!status.ok()) VLOG(1) << status; status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1)); } return status; @@ -2059,6 +2459,20 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) { nvinfer1::IUnaryLayer* layer; if (node_def.op() == "Rsqrt") { + // We will need a quantization range for intermediate tensor if not using + // calibration. + // + // x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x) + // ^ + // need range here + if (params->converter->precision_mode() == INT8MODE && + !params->converter->use_calibration()) { + return errors::Unimplemented( + "Intermediate quantization range cannot be determined without" + " calibration for Rsqrt, consider replacing with " + "Sqrt -> FakeQuant -> Reciprocal ops, at ", + node_def.name()); + } layer = params->converter->network()->addUnary( *const_cast(tensor), nvinfer1::UnaryOperation::kSQRT); @@ -2618,6 +3032,8 @@ tensorflow::Status ConvertSoftmax(OpConverterParams* params) { layer->setAxes(1 << (nbDims - 1)); nvinfer1::ITensor* output_tensor = layer->getOutput(0); + // Quantization range for SoftMax is always (0, 1) + params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return tensorflow::Status::OK(); } @@ -2658,40 +3074,49 @@ tensorflow::Status ConvertTopK(OpConverterParams* params) { return tensorflow::Status::OK(); } -void TrtNodeValidator::RegisterOpValidators() { +static void RegisterValidatableOpConverters( + std::unordered_map* registration) { // TODO(laigd): support all op types. - op_validators_["Const"] = ConvertConst; - op_validators_["Transpose"] = ConvertTranspose; - op_validators_["Reshape"] = ConvertReshape; - op_validators_["MatMul"] = ConvertMatMul; + (*registration)["BiasAdd"] = ConvertBiasAdd; + (*registration)["Const"] = ConvertConst; + (*registration)["Transpose"] = ConvertTranspose; + (*registration)["Reshape"] = ConvertReshape; + (*registration)["MatMul"] = ConvertMatMul; + (*registration)["Relu6"] = ConvertRelu6; + + for (auto quantization_op_type : + {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) { + (*registration)[quantization_op_type] = ConvertQuantize; + } + for (auto binary_op_type : + {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) { + (*registration)[binary_op_type] = ConvertBinary; + } +} + +void TrtNodeValidator::RegisterOpValidators() { + RegisterValidatableOpConverters(&op_validators_); } void Converter::RegisterOpConverters() { - // vgg_16 slim implementation + RegisterValidatableOpConverters(&op_registry_); + op_registry_["Conv2D"] = ConvertConv2D; op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise; op_registry_["Relu"] = ConvertActivation; op_registry_["MaxPool"] = ConvertPool; op_registry_["AvgPool"] = ConvertPool; - op_registry_["BiasAdd"] = ConvertScale; - op_registry_["Const"] = ConvertConst; // TODO(ben,jie): this is a temp hack. op_registry_["Identity"] = ConvertIdentity; // Identity should be removed op_registry_["Snapshot"] = ConvertIdentity; // Snapshot should be removed - // resnet_50_v1 slim implementation - op_registry_["Add"] = ConvertBinary; - op_registry_["Mul"] = ConvertBinary; - op_registry_["Sub"] = ConvertBinary; op_registry_["Pad"] = ConvertPad; op_registry_["ConcatV2"] = ConvertConcat; op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm; op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm; - op_registry_["Div"] = ConvertBinary; - op_registry_["RealDiv"] = ConvertBinary; - op_registry_["Rsqrt"] = ConvertUnary; op_registry_["Reciprocal"] = ConvertUnary; op_registry_["Exp"] = ConvertUnary; @@ -2700,20 +3125,19 @@ void Converter::RegisterOpConverters() { op_registry_["Abs"] = ConvertUnary; op_registry_["Neg"] = ConvertUnary; - op_registry_["Transpose"] = ConvertTranspose; - op_registry_["Reshape"] = ConvertReshape; - op_registry_["Sum"] = ConvertReduce; op_registry_["Prod"] = ConvertReduce; op_registry_["Max"] = ConvertReduce; op_registry_["Min"] = ConvertReduce; op_registry_["Mean"] = ConvertReduce; - op_registry_["Maximum"] = ConvertBinary; - op_registry_["Minimum"] = ConvertBinary; op_registry_["Softmax"] = ConvertSoftmax; - op_registry_["MatMul"] = ConvertMatMul; op_registry_["BatchMatMul"] = ConvertBatchMatMul; op_registry_["TopKV2"] = ConvertTopK; + op_registry_["Relu6"] = ConvertRelu6; + op_registry_["QuantizeAndDequantizeV2"] = ConvertQuantize; + op_registry_["QuantizeAndDequantizeV3"] = ConvertQuantize; + op_registry_["FakeQuantWithMinMaxVars"] = ConvertQuantize; + op_registry_["FakeQuantWithMinMaxArgs"] = ConvertQuantize; plugin_converter_ = ConvertPlugin; } @@ -2724,7 +3148,7 @@ tensorflow::Status ConvertGraphDefToEngine( const std::vector& input_shapes, Logger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, - TrtUniquePtrType* engine, + TrtUniquePtrType* engine, bool use_calibration, bool* convert_successfully) { engine->reset(); if (convert_successfully) *convert_successfully = false; @@ -2739,7 +3163,11 @@ tensorflow::Status ConvertGraphDefToEngine( builder->setHalf2Mode(true); } else if (precision_mode == INT8MODE) { builder->setInt8Mode(true); - builder->setInt8Calibrator(calibrator); + if (use_calibration) { + builder->setInt8Calibrator(calibrator); + } else { + builder->setInt8Calibrator(nullptr); + } } // Create the network. @@ -2752,7 +3180,7 @@ tensorflow::Status ConvertGraphDefToEngine( // Build the network VLOG(1) << "Starting engine conversion "; - Converter converter(trt_network.get(), precision_mode == FP16MODE); + Converter converter(trt_network.get(), precision_mode, use_calibration); std::vector> output_tensors; // Graph nodes are already topologically sorted during construction for (const auto& node_def : gdef.node()) { @@ -2808,6 +3236,9 @@ tensorflow::Status ConvertGraphDefToEngine( TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_tensors)); if (convert_successfully) *convert_successfully = true; + // Apply user provided quantization ranges to tensors + converter.MaybeApplyQuantizationRanges(); + // Build the engine. VLOG(1) << "Starting engine creation"; engine->reset(builder->buildCudaEngine(*converter.network())); diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h index 5cc28b33e7f2c56d2f281d24e8390d253a8228f5..54e19b73957bccdae2b23bd3556de9ad00b864e5 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h @@ -92,7 +92,8 @@ struct EngineInfo { EngineInfo() : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), - precision_mode(FP32MODE) {} + precision_mode(FP32MODE), + use_calibration(true) {} string engine_name; string device; @@ -109,6 +110,7 @@ struct EngineInfo { int maximum_cached_engines; std::vector cached_engine_batches; int precision_mode; + bool use_calibration; }; // Constructs a graphdef from the segment in the given graph. Adds placeholder @@ -145,7 +147,7 @@ tensorflow::Status ConvertGraphDefToEngine( const std::vector& input_shapes, Logger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, - TrtUniquePtrType* engine, + TrtUniquePtrType* engine, bool use_calibration, bool* convert_successfully); // Helper class for the segmenter to determine whether an output edge from the @@ -392,7 +394,8 @@ class TrtNodeValidator { // Class to convert TF nodes to TRT network. class Converter { public: - Converter(nvinfer1::INetworkDefinition* trt_network, bool is_fp16); + Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode, + bool use_calibration); ////////////////////////////////////////////////////////////////////////////// // Methods used by the TRT engine builder to build a TRT network from a TF @@ -422,8 +425,27 @@ class Converter { // to add TRT layers. nvinfer1::INetworkDefinition* network() { return trt_network_; } - // Is the converter operating in fp16 mode? - bool is_fp16() const { return is_fp16_; } + // What precision are we targeting? + int precision_mode() const { return precision_mode_; } + + // Calibration will be or was previously performed on this network? + bool use_calibration() const { return use_calibration_; } + + // This should be called on the inputs and outputs of any layer we create + // where we know that the quantization range does not change during that + // operation. (e.g. Reshape, Transpose, Identity, MaxPool). + void MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input, + nvinfer1::ITensor* output); + + // This function should be called when we know the quantization range of a + // tensor, either from a quantize/dequantize node or when the output is a + // fixed range (e.g. SoftMax, Relu6, Sigmoid). + void ProvideQuantizationRange(nvinfer1::ITensor* tensor, float min_range, + float max_range); + + // Should be called when full TRT network has been constructed and before + // building the engine. + void MaybeApplyQuantizationRanges(); // Below are helper methods for op converters to add different layers to the // TRT network. @@ -440,6 +462,13 @@ class Converter { const nvinfer1::Dims& dims, const nvinfer1::ITensor** tensor); + // Return OK if the broadcast scheme is supported and compute the shapes after + // broadcasting. + Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, + const TRT_TensorOrWeights& operand_r, + nvinfer1::Dims* operand_l_new_dims, + nvinfer1::Dims* operand_r_new_dims) const; + private: // Verify the provided batch_size is consistent with batch_size_ and update it // if necessary. @@ -457,6 +486,12 @@ class Converter { void RegisterOpConverters(); + void PropagateQuantizationRanges(); + + // Gets the min and max value in a TRT_ShapedWeights + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const; + // Registered op converters by op type. std::unordered_map op_registry_; @@ -472,7 +507,25 @@ class Converter { // Store the weights added during construction of trt_network_. TrtWeightStore weight_store_; - const bool is_fp16_; + // During conversion, this table is populated with quantization ranges per + // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT + // quantization ranges. Since TRT only supports symmetric ranges, we will + // store the range as a single float = max(abs(min_range), abs(max_range)). + // Range refers to the floating point values, e.g. min_range = 0.0f, max_range + // = 6.0f for Relu6. + std::unordered_map quantization_ranges_; + + // Edges where quantization ranges can be inferred (copied) across ops - from + // first tensor to second tensor. PropagateQuantizationRanges() will propagate + // known ranges from quantization_ranges_ across these edges, adding the new + // ranges to quantization_ranges_ so that they can be applied in + // MaybeApplyQuantizationRanges(). + std::vector> + quantization_infer_; + + const int precision_mode_; + + const bool use_calibration_; // Batch size of inputs to trt_network_ added by AddInputTensor(). During // network construction it will update this, use it to verify the batch diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc index c3a39395f3a99f3e471e09688a11cc0ebba61ff4..603c4f7b5e5af8df7f81484c715675968f5da695 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc @@ -35,7 +35,10 @@ limitations under the License. #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/public/session.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -47,7 +50,9 @@ namespace tensorflow { namespace tensorrt { namespace convert { +using ::tensorflow::strings::StrCat; using ::testing::ElementsAre; +using ::testing::ElementsAreArray; // TODO(laigd): put this into some test utils file. void ExpectStatus(Status status, error::Code code = error::OK, @@ -69,6 +74,32 @@ nvinfer1::Dims GetTestDims(const std::vector& d) { return dims; } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { + switch (tf_dtype) { + case DT_FLOAT: + return nvinfer1::DataType::kFLOAT; + case DT_HALF: + return nvinfer1::DataType::kHALF; + case DT_INT32: + return nvinfer1::DataType::kINT32; + default: + QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); + } +} + +DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { + switch (trt_dtype) { + case nvinfer1::DataType::kFLOAT: + return DT_FLOAT; + case nvinfer1::DataType::kHALF: + return DT_HALF; + case nvinfer1::DataType::kINT32: + return DT_INT32; + default: + QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); + } +} + NodeDef MakeNodeDef(const string& name, const string& op, const std::vector& inputs) { NodeDef node_def; @@ -111,6 +142,15 @@ bool TrtDimsEqualsArray(const std::vector& lhs, return TrtDimsEquals(GetTestDims(lhs), rhs); } +// TODO(laigd): define a parameterized matcher that can compare against the +// vector. +void ExpectTrtDimsEqualsArray(const std::vector& lhs, + const nvinfer1::Dims& rhs) { + EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs)) + << "expected: " << DebugString(GetTestDims(lhs)) << "\n" + << " actual: " << DebugString(rhs); +} + bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs, const TRT_ShapedWeights& rhs) { return TrtDimsEquals(lhs.shape_, rhs.shape_) && lhs.type_ == rhs.type_ && @@ -121,8 +161,7 @@ template void ValidateWeights(const TRT_ShapedWeights& weights, const std::vector& expected_dims, const std::vector& expected_value) { - EXPECT_TRUE(TrtDimsEqualsArray(expected_dims, weights.shape_)) - << weights.DebugString(); + ExpectTrtDimsEqualsArray(expected_dims, weights.shape_); ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString(); const T* actual_values = static_cast(weights.GetValues()); for (int i = 0; i < expected_value.size(); ++i) { @@ -133,11 +172,12 @@ void ValidateWeights(const TRT_ShapedWeights& weights, // Fake ITensor implementation for testing purposes. class FakeITensor : public nvinfer1::ITensor { public: - FakeITensor() {} + FakeITensor() : dynamic_range_(0.0f) {} - FakeITensor(const nvinfer1::Dims& dims) : dims_(dims) {} + FakeITensor(const nvinfer1::Dims& dims) : dims_(dims), dynamic_range_(0.0f) {} - FakeITensor(const std::vector& dims) : dims_(GetTestDims(dims)) {} + FakeITensor(const std::vector& dims) + : dims_(GetTestDims(dims)), dynamic_range_(0.0f) {} void setName(const char* name) override { name_ = name; } @@ -166,7 +206,12 @@ class FakeITensor : public nvinfer1::ITensor { } #if NV_TENSORRT_MAJOR >= 5 - bool setDynamicRange(float min, float max) override {} + bool setDynamicRange(float min, float max) override { + dynamic_range_ = std::max(std::abs(min), std::abs(max)); + return true; + } + + float getDynamicRange() const override { return dynamic_range_; } #endif private: @@ -174,6 +219,7 @@ class FakeITensor : public nvinfer1::ITensor { nvinfer1::Dims dims_; nvinfer1::DataType type_; nvinfer1::TensorLocation location_; + float dynamic_range_; }; TEST(TRT_ShapedWeights_Test, Basic) { @@ -265,9 +311,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(1, ptr->batch_size()); } EXPECT_EQ(&itensor, ptr->tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims())) - << "- expected: " << DebugString(dims) - << "\n vs\n- actual: " << DebugString(ptr->GetTrtDims()); + ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); } } } @@ -286,9 +330,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { EXPECT_EQ(false, ptr->is_weights()); EXPECT_EQ(1, ptr->batch_size()); EXPECT_NE(nullptr, ptr->tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims())) - << "- expected: " << DebugString(dims) - << "\n vs\n- actual: " << DebugString(ptr->GetTrtDims()); + ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims()); } } // Test constructor with TRT_ShapedWeights argument. @@ -305,9 +347,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) { nvinfer1::Dims dims; dims.nbDims = 0; - EXPECT_TRUE(TrtDimsEqualsArray({}, ptr->GetTrtDims())) - << "- expected: " << DebugString(dims) - << "\n vs\n- actual: " << DebugString(ptr->GetTrtDims()); + ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims()); } } } @@ -341,34 +381,50 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) { graph_properties, &output)); ValidateWeights(output.weights(), {2}, {1.0, 2.0}); } - // Convert non-Const. We test the case where the non-batch dimemsion is - // unknown as well, to make sure the validator allows that. - for (const int32 non_batch_dim : {-1, 2}) { - const int32 batch_size = 12; + // Helper method to run ConvertToTensorOrWeights() with predefined parameters. + auto convert_to_tensor_or_weights = [this](const std::vector& dims, + TRT_TensorOrWeights* output) { Scope s = Scope::NewRootScope(); - ops::Placeholder::Attrs attrs; - TF_EXPECT_OK(TensorShapeUtils::MakeShape( - std::vector{batch_size, non_batch_dim}, &attrs.shape_)); + const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims}); auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT, attrs); auto add = ops::Add(s.WithOpName("add"), feed, feed); grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); - grappler::GraphProperties graph_properties(item); TF_EXPECT_OK(graph_properties.InferStatically(true)); - - auto& node_def = add.operation.node()->def(); + const NodeDef& node_def = add.operation.node()->def(); + return this->ConvertToTensorOrWeights(node_def, /*output_port=*/0, + graph_properties, output); + }; + // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1. + { TRT_TensorOrWeights output; - ExpectStatus(ConvertToTensorOrWeights(node_def, /*output_port=*/0, - graph_properties, &output)); + ExpectStatus( + convert_to_tensor_or_weights( + std::vector(nvinfer1::Dims::MAX_DIMS + 2, 1), &output), + error::OUT_OF_RANGE, "Input tensor rank is greater than 9"); + } + // Convert non-Const with #dims < 2. + { + TRT_TensorOrWeights output; + ExpectStatus( + convert_to_tensor_or_weights({1}, &output), error::INVALID_ARGUMENT, + "Input tensor with rank<2 is not supported since the first dimension " + "is treated as batch dimension by TRT"); + } + // Convert non-Const. We test the case where the non-batch dimemsion is + // unknown as well, to make sure the validator allows that. + for (const int32 non_batch_dim : {-1, 2}) { + const int32 batch_size = 12; + TRT_TensorOrWeights output; + ExpectStatus( + convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output)); EXPECT_EQ(true, output.is_tensor()); EXPECT_EQ(batch_size, output.batch_size()); EXPECT_NE(nullptr, output.tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims())) - << "- expected: {" << non_batch_dim << "} \n vs\n" - << "- actual: " << DebugString(output.GetTrtDims()); + ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims()); } } @@ -405,7 +461,9 @@ class ConverterTest : public ::testing::Test { ConverterTest() { builder_.reset(nvinfer1::createInferBuilder(logger_)); network_.reset(builder_->createNetwork()); - converter_.reset(new Converter(network_.get(), /*fp16=*/false)); + converter_.reset(new Converter(network_.get(), + /*precision_mode=*/FP32MODE, + /*use_calibration=*/false)); weight_store_ = &converter_->weight_store_; } @@ -432,8 +490,21 @@ class ConverterTest : public ::testing::Test { return converter_->GetInputs(node_def, inputs); } + Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min, + float* out_max) const { + return converter_->GetWeightRange(weights, out_min, out_max); + } + + void PropagateQuantizationRanges() { + converter_->PropagateQuantizationRanges(); + } + int batch_size() const { return converter_->batch_size_; } + std::unordered_map& quantization_ranges() { + return converter_->quantization_ranges_; + } + private: Logger logger_; // These members are ordered in a way such that the destruction order is: @@ -504,9 +575,9 @@ TEST_F(ConverterTest, AddAndGetInputs) { EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType()); EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType()); - EXPECT_TRUE(TrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions())); - EXPECT_TRUE(TrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions())); - EXPECT_TRUE(TrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions())); + ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()); + ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()); + ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()); } TEST_F(ConverterTest, RenameAndMarkOutputTensors) { @@ -552,7 +623,7 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}})); EXPECT_EQ(2, output_tensors.size()); for (auto output_tensor : output_tensors) { - EXPECT_TRUE(TrtDimsEqualsArray({2, 1}, output_tensor->getDimensions())); + ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions()); } EXPECT_EQ("my_output", string(output_tensors[0]->getName())); EXPECT_EQ("my_output_1", string(output_tensors[1]->getName())); @@ -577,8 +648,7 @@ TEST_F(ConverterTest, TransposeTensor) { // OK. TF_EXPECT_OK( converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()); } TEST_F(ConverterTest, PrepareTensorForShape_Tensor) { @@ -590,7 +660,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) { // Shape size doesn't match. ExpectStatus(converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}), &output_tensor), - error::INVALID_ARGUMENT, "Reshape shapes are not compatible."); + error::INVALID_ARGUMENT, "Reshape shapes are not compatible"); // TODO(aaroey): we should check the case where uninferred dimensions are not // an exact divisor of input dim ensions, e.g. for dims {-1, 7}. @@ -598,14 +668,12 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) { // Infer shape, ok. TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({-1, 2}), &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({15, 2}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions()); // Regular shape. TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}), &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()); } TEST_F(ConverterTest, PrepareTensorForShape_Weights) { @@ -615,8 +683,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Weights) { const nvinfer1::ITensor* output_tensor = nullptr; TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}), &output_tensor)); - EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions())) - << DebugString(*output_tensor); + ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()); } TEST_F(ConverterTest, MaybeUpdateBatchSize) { @@ -656,6 +723,178 @@ TEST_F(ConverterTest, AddAndGetTensorOrWeights) { "tensor/weights my_tensor already exist"); } +template +void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) { + TRT_ShapedWeights weights = + weight_store->GetTempWeights(DataTypeToEnum::v(), GetTestDims({2, 3})); + const std::vector values = {T(3), T(1), T(2), T(6), T(5), T(4)}; + memcpy(const_cast(weights.GetValues()), values.data(), + weights.size_bytes()); + + float out_min = 0.0f; + float out_max = 0.0f; + TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max)); + EXPECT_EQ(1.0f, out_min); + EXPECT_EQ(6.0f, out_max); +} + +TEST_F(ConverterTest, GetWeightRange) { + TestGetWeightRange(this, weight_store_); + TestGetWeightRange(this, weight_store_); + TestGetWeightRange(this, weight_store_); +} + +TEST_F(ConverterTest, ProvideQuantizationRange) { + FakeITensor fake_tensor; + // Assymetric range + converter_->ProvideQuantizationRange(&fake_tensor, 0.0f, 6.0f); + EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]); + converter_->ProvideQuantizationRange(&fake_tensor, 1.0f, 6.0f); + EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]); + converter_->ProvideQuantizationRange(&fake_tensor, -8.0f, 6.0f); + EXPECT_EQ(8.0f, quantization_ranges()[&fake_tensor]); + converter_->ProvideQuantizationRange(&fake_tensor, -8.123f, -6.123f); + EXPECT_EQ(8.123f, quantization_ranges()[&fake_tensor]); + // Symmetric range + converter_->ProvideQuantizationRange(&fake_tensor, -6.123f, 6.123f); + EXPECT_EQ(6.123f, quantization_ranges()[&fake_tensor]); +} + +TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { + // input -> infer1 -> infer2 -> infer3 + FakeITensor input, infer_1, infer_2, infer_3; + FakeITensor not_infer; + Converter int8_converter(/*trt_network=*/nullptr, INT8MODE, + /*use_calibration=*/true); + int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f); + int8_converter.ProvideQuantizationRange(¬_infer, -100.0f, 100.0f); + int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1); + int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2); + int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3); + + // Input range should be inferred along the chain and applied to tensors. + int8_converter.MaybeApplyQuantizationRanges(); +#if NV_TENSORRT_MAJOR >= 5 + EXPECT_EQ(input.getDynamicRange(), 5.0f); + EXPECT_EQ(infer_1.getDynamicRange(), 5.0f); + EXPECT_EQ(infer_2.getDynamicRange(), 5.0f); + EXPECT_EQ(infer_3.getDynamicRange(), 5.0f); + EXPECT_EQ(not_infer.getDynamicRange(), 100.0f); +#endif +} + +TEST_F(ConverterTest, PropagateQuantizationRanges) { + // infer0 <-> infer1 <-> infer2 <-> infer3 + // | + // infer4 <-> infer5 + FakeITensor infer[6]; + FakeITensor not_infer; + converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f); + converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]); + converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]); + converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]); + converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]); + converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]); + + // Input range should be inferred along the chain. + PropagateQuantizationRanges(); + auto ranges = quantization_ranges(); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(5.0f, ranges[&infer[i]]); + } + EXPECT_EQ(ranges.count(¬_infer), 0); +} + +TEST_F(ConverterTest, GetTrtBroadcastShape) { + const bool kIsTensor = true; + const bool kIsNotTensor = false; + auto symmetric_test = [this](const std::vector& operand_1_shape, + const std::vector& operand_2_shape, + const bool operand_1_is_tensor, + const bool operand_2_is_tensor, + const std::vector& expected_operand_1_shape, + const std::vector& expected_operand_2_shape, + error::Code expected_code = error::OK, + const char* expected_error_msg_substr = nullptr, + const int operand_1_batch_size = -1, + const int operand_2_batch_size = -1) { + auto create_tensor_or_weights = [](const std::vector& shape, + bool is_tensor, int batch_size = -1) { + if (is_tensor) { + return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT, + GetTestDims(shape), batch_size}; + } + TRT_ShapedWeights weights; + weights.shape_ = GetTestDims(shape); + return TRT_TensorOrWeights(weights); + }; + + nvinfer1::Dims operand_1_new_dims, operand_2_new_dims; + TRT_TensorOrWeights operand_1 = create_tensor_or_weights( + operand_1_shape, operand_1_is_tensor, operand_1_batch_size); + TRT_TensorOrWeights operand_2 = create_tensor_or_weights( + operand_2_shape, operand_2_is_tensor, operand_2_batch_size); + + // operand_1 broadcast operand_2 + ExpectStatus( + this->converter_->GetTrtBroadcastShape( + operand_1, operand_2, &operand_1_new_dims, &operand_2_new_dims), + expected_code, expected_error_msg_substr); + if (expected_code == error::OK) { + ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); + ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + } + // operand_2 broadcast operand_1 + ExpectStatus( + this->converter_->GetTrtBroadcastShape( + operand_2, operand_1, &operand_2_new_dims, &operand_1_new_dims), + expected_code, expected_error_msg_substr); + if (expected_code == error::OK) { + ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims); + ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims); + } + }; + + // Both inputs are weights. + symmetric_test( + {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT, + "Broadcasting requires at least one of the operands be tensors"); + + // One tensor and one weights. + symmetric_test({1, 1, 1}, {2}, kIsTensor, kIsNotTensor, {1, 1, 1}, {1, 1, 2}); + symmetric_test({1, 1, 2}, {2}, kIsTensor, kIsNotTensor, {1, 1, 2}, {1, 1, 2}); + symmetric_test({1, 3, 2}, {1}, kIsTensor, kIsNotTensor, {1, 3, 2}, {1, 1, 1}); + symmetric_test({1, 1, 1}, {2, 3}, kIsTensor, kIsNotTensor, {1, 1, 1}, + {1, 2, 3}); + symmetric_test({1, 1, 1}, {2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1}, + {2, 3, 4}); + symmetric_test({1, 1, 1}, {1, 2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1}, + {2, 3, 4}); + symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4}, + {2, 1, 4}); + symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, "Infeasible broadcast scheme", + /*operand_1_batch_size=*/2); + symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 4 vs broadcast #dims 5)"); + + // Both inputs are tensors. + symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 3 vs broadcast #dims 4)"); + symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4}, + {2, 1, 4}); + symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {}, + error::INVALID_ARGUMENT, + "Broadcasting beyond batch dimension is not supported " + "(tensor #dims 4 vs broadcast #dims 5)"); +} + // Class to test various op converters, using both a TrtNodeValidator and // Converter. class OpConverterTest : public ::testing::Test { @@ -684,15 +923,21 @@ class OpConverterTest : public ::testing::Test { // Reset the validator and converter. validator_.reset(new TrtNodeValidator); - converter_.reset(new Converter(network_.get(), /*fp16=*/false)); + converter_.reset(new Converter(network_.get(), + /*precision_mode=*/FP32MODE, + /*use_calibration=*/false)); // Reset other related artifacts. scope_ = Scope::NewRootScope(); validator_inputs_.clear(); } - void BuildAndRun(const char* input_name, const std::vector& input_data, - const char* output_name, std::vector* output_data) { + // TODO(laigd): test fp16 and int8 support. + template + void BuildAndRun( + const std::vector>>& + input_data, + const char* output_name, std::vector* output_data) { // Mark the output tensor as TRT engine output. TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors( {{string(output_name), string(output_name)}})); @@ -703,25 +948,33 @@ class OpConverterTest : public ::testing::Test { CHECK_NOTNULL(engine_.get()); // Execute the TRT engine. - const int input_size = input_data.size() * sizeof(float); - const int output_size = output_data->size() * sizeof(float); - const int input_index = engine_->getBindingIndex(input_name); - const int output_index = engine_->getBindingIndex(output_name); + ASSERT_LE(input_data.size() + 1, 3); + void* buffers[3]; + for (const auto name_and_data : input_data) { + const int input_size = name_and_data.second.size() * sizeof(T); + const int input_index = engine_->getBindingIndex(name_and_data.first); + ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size)); + ASSERT_EQ( + 0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(), + input_size, cudaMemcpyHostToDevice, stream_)); + } - ASSERT_EQ(engine_->getNbBindings(), 2); - void* buffers[2]; - ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size)); + const int output_size = output_data->size() * sizeof(T); + const int output_index = engine_->getBindingIndex(output_name); ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size)); - ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input_data.data(), - input_size, cudaMemcpyHostToDevice, stream_)); + + ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1); + TrtUniquePtrType execution_context( engine_->createExecutionContext()); execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr); ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index], output_size, cudaMemcpyDeviceToHost, stream_)); cudaStreamSynchronize(stream_); - ASSERT_EQ(0, cudaFree(buffers[input_index])); - ASSERT_EQ(0, cudaFree(buffers[output_index])); + + for (int i = 0; i < input_data.size() + 1; ++i) { + ASSERT_EQ(0, cudaFree(buffers[i])); + } } bool HasStaticShape(const nvinfer1::Dims& dims) const { @@ -736,18 +989,7 @@ class OpConverterTest : public ::testing::Test { void AddTestTensor( const char* name, const std::vector& dims, int batch_size = 1, nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) { - DataType tf_dtype = DT_FLOAT; - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - tf_dtype = DT_FLOAT; - break; - case nvinfer1::DataType::kINT32: - tf_dtype = DT_INT32; - break; - default: - ASSERT_TRUE(false) << "Unexpected data type " - << static_cast(trt_dtype); - } + DataType tf_dtype = TrtDataTypeToTf(trt_dtype); ops::Placeholder::Attrs attrs; TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_)); attrs.shape_.InsertDim(0, batch_size); @@ -826,6 +1068,11 @@ class OpConverterTest : public ::testing::Test { } } + // Expose quantization_ranges_ for tests + std::unordered_map& quantization_ranges() { + return converter_->quantization_ranges_; + } + std::unique_ptr converter_; std::unique_ptr validator_; @@ -835,6 +1082,11 @@ class OpConverterTest : public ::testing::Test { TrtUniquePtrType network_; TrtUniquePtrType engine_; cudaStream_t stream_; + // Used to create placeholders with shape and data type information. The + // created placeholders will be used as inputs to the node to be verified, + // thus we need the shape and data type information to get a non-empty + // GraphProperties. + // TODO(laigd): consider use this Scope to create the NodeDef to verify. Scope scope_; std::unordered_map validator_inputs_; }; @@ -958,15 +1210,15 @@ TEST_F(OpConverterTest, ConvertTranspose) { Reset(); AddTestTensor("input", {1, 2, 3}); AddTestWeights("weights", {4}, {0, 3, 1, 2}); - RunConversion(node_def); + RunValidationAndConversion(node_def); TRT_TensorOrWeights output; TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output)); EXPECT_TRUE(output.is_tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions())) - << output.DebugString(); + ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()); std::vector output_data(6); - BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_transpose", &output_data); + BuildAndRun({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose", + &output_data); EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6)); } } @@ -1048,15 +1300,15 @@ TEST_F(OpConverterTest, ConvertReshape) { Reset(); AddTestTensor("input", ok_params[i].tensor_dims, ok_params[i].batch_size); AddTestWeights("weights", {4}, ok_params[i].shape); - RunConversion(node_def); + RunValidationAndConversion(node_def); TRT_TensorOrWeights output; TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output)); EXPECT_TRUE(output.is_tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions())) - << output.DebugString(); + ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions()); std::vector output_data(6); - BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_reshape", &output_data); + BuildAndRun({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape", + &output_data); EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6)); } } @@ -1070,15 +1322,14 @@ TEST_F(OpConverterTest, ConvertMatMul) { "Input expects tensor and weights, at my_matmul"); } - // Get the NodeDef for Reshape. + // Get the NodeDef for MatMul. auto get_matmul_nodedef = [](DataType dtype, bool transpose_a, bool transpose_b) -> NodeDef { Scope s = Scope::NewRootScope(); auto input = ops::Placeholder(s.WithOpName("input"), dtype); auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); - ops::MatMul::Attrs matmul_attrs; - matmul_attrs.transpose_a_ = transpose_a; - matmul_attrs.transpose_b_ = transpose_b; + const auto matmul_attrs = + ops::MatMul::TransposeA(transpose_a).TransposeB(transpose_b); auto matmul = ops::MatMul(s.WithOpName("my_matmul"), input, weights, matmul_attrs); return matmul.operation.node()->def(); @@ -1094,45 +1345,625 @@ TEST_F(OpConverterTest, ConvertMatMul) { node_def, error::UNIMPLEMENTED, "Data type is not supported, for node my_matmul got int32"); } - { - // transpose_a is set. - for (bool transpose_b : {false, true}) { - Reset(); - NodeDef node_def = - get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunValidationAndConversion( - node_def, error::INVALID_ARGUMENT, - "transpose_a is not supported for TensorRT FullyConnected"); + // transpose_a is set. + for (bool transpose_b : {false, true}) { + Reset(); + NodeDef node_def = + get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b); + AddTestTensor("input", {2}, /*batch_size=*/1); + AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "transpose_a is not supported for TensorRT FullyConnected"); + } + // OK. + for (bool transpose_b : {false, true}) { + Reset(); + NodeDef node_def = + get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b); + AddTestTensor("input", {2}, /*batch_size=*/1); + AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions()); + + std::vector output_data(2); + BuildAndRun({{"input", {0, 1}}}, "my_matmul", &output_data); + if (transpose_b) { + EXPECT_THAT(output_data, ElementsAre(1, 3)); + } else { + EXPECT_THAT(output_data, ElementsAre(2, 3)); } } - { - // OK. - for (bool transpose_b : {false, true}) { - Reset(); - NodeDef node_def = - get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b); - AddTestTensor("input", {2}, /*batch_size=*/1); - AddTestWeights("weights", {2, 2}, {0, 1, 2, 3}); - RunConversion(node_def); +} + +template +void TestConvertBiasAdd(OpConverterTest* test) { + // Get the NodeDef for BiasAdd. + auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), dtype); + auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); + const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format); + auto biasadd = + ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs); + return biasadd.operation.node()->def(); + }; + + typedef typename EnumToDataType::Type CType; + for (const string& data_format : {"NHWC", "NCHW"}) { + for (const int trt_input_rank : {1, 2, 3, 4}) { + test->Reset(); + NodeDef node_def = get_biasadd_nodedef(data_format); + + // Add input, dims_array will be like {2, 1, ..., 1, 3} + std::vector dims_array(trt_input_rank, 1); + if (trt_input_rank == 1) { + dims_array[0] = (data_format == "NHWC" ? 3 : 2); + } else { + dims_array[0] = 2; + dims_array[trt_input_rank - 1] = 3; + } + test->AddTestTensor("input", dims_array, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + + // Add bias weights. + const int channel_size = (data_format == "NHWC" ? 3 : 2); + std::vector bias(channel_size); + for (int i = 0; i < channel_size; ++i) { + bias[i] = CType(i + 1); // bias will be {1, 2, 3, ...} + } + test->AddTestWeights("weights", {channel_size}, bias); + + // Run the conversion. + test->RunValidationAndConversion(node_def); TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output)); + TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output)); EXPECT_TRUE(output.is_tensor()); - EXPECT_TRUE(TrtDimsEqualsArray({2}, output.tensor()->getDimensions())) - << output.DebugString(); - - std::vector output_data(2); - BuildAndRun("input", {0, 1}, "my_matmul", &output_data); - if (transpose_b) { - EXPECT_THAT(output_data, ElementsAre(1, 3)); + ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions()); + + // Build and run the engine. + const int num_input = TrtDimsNumElements(GetTestDims(dims_array)); + ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2), + num_input); + std::vector output_data(num_input); + test->BuildAndRun( + {{"input", std::vector(num_input, CType(0))}}, "my_biasadd", + &output_data); + if (trt_input_rank == 1) { + if (data_format == "NHWC") { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3))); + } else { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2))); + } } else { - EXPECT_THAT(output_data, ElementsAre(2, 3)); + if (data_format == "NHWC") { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3), + CType(1), CType(2), CType(3))); + } else { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1), + CType(2), CType(2), CType(2))); + } } } } } +TEST_F(OpConverterTest, ConvertBiasAdd) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Input expects tensor and weights, at my_biasadd"); + } + + // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test + // DT_INT32 type here. + TestConvertBiasAdd(this); + TestConvertBiasAdd(this); +} + +template +NodeDef GetBinaryOpNodeDef(const string& input_name_l, + const string& input_name_r, DataType dtype) { + Scope s = Scope::NewRootScope(); + auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype); + auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype); + auto op = OpType(s.WithOpName("my_binary"), input_l, input_r); + return op.operation.node()->def(); +} + +void CheckAddedLayers(OpConverterTest* test, bool expect_scale_layer) { + bool element_wise_layer_found = false; + bool scale_layer_found = false; + for (int i = 0; i < test->converter_->network()->getNbLayers(); i++) { + nvinfer1::ILayer* layer = test->converter_->network()->getLayer(i); + if (dynamic_cast(layer)) { + scale_layer_found = true; + } else if (dynamic_cast(layer)) { + element_wise_layer_found = true; + } + } + EXPECT_EQ(expect_scale_layer, scale_layer_found); + EXPECT_NE(expect_scale_layer, element_wise_layer_found); +} + +template +void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + for (auto swap_inputs : {false, true}) { + test->Reset(); + NodeDef node_def; + if (swap_inputs) { + node_def = GetBinaryOpNodeDef("weights", "input", dtype); + } else { + node_def = GetBinaryOpNodeDef("input", "weights", dtype); + } + + const std::vector operand1{CType(3), CType(7.5)}; + const std::vector operand2{CType(2), CType(3)}; + + // It requires the dims to be at least of rank 3 to apply an IScaleLayer. + test->AddTestTensor("input", /*dims=*/{1, 1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestWeights("weights", /*dims=*/{1, 1, 2}, + /*values=*/swap_inputs ? operand1 : operand2); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. + CheckAddedLayers(test, /*expect_scale_layer=*/true); + + // Check the dims of the output ITensor. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions()); + + std::vector output_data(2); + test->BuildAndRun( + {{"input", + /*input_data=*/swap_inputs ? operand2 : operand1}}, + "my_binary", &output_data); + if (node_def.op() == "Add") { + EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5))); + } else if (node_def.op() == "Sub") { + EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5))); + } else if (node_def.op() == "Mul") { + EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5))); + } else if (node_def.op() == "Div") { + EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5))); + } else if (node_def.op() == "RealDiv") { + EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5))); + } else { + ASSERT_TRUE(false); + } + } +} + +template +void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + const NodeDef node_def = + GetBinaryOpNodeDef("input", "weights", dtype); + const std::vector input{CType(1), CType(2), CType(3), CType(4)}; + const std::vector weights{CType(10), CType(20)}; + // There are two types of valid dim pairs which requires channel-wise + // broadcasting: + // - input dims (X Y Z) vs weights dims (X 1 1) + // - input dims (X Y Z) vs weights dims (Z) + // Here X=Z=2 and Y=1. + for (auto weights_dims : std::vector>{{2, 1, 1}, {2}}) { + test->Reset(); + test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestWeights("weights", weights_dims, weights); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. + CheckAddedLayers(test, /*expect_scale_layer=*/true); + + // Check the dims of the output ITensor. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions()); + + std::vector output_data(4); + test->BuildAndRun({{"input", input}}, "my_binary", &output_data); + if (weights_dims.size() == 1) { + EXPECT_THAT(output_data, + ElementsAre(CType(11), CType(22), CType(13), CType(24))); + } else { + EXPECT_THAT(output_data, + ElementsAre(CType(11), CType(12), CType(23), CType(24))); + } + } +} + +template +void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + const NodeDef node_def = + GetBinaryOpNodeDef("input", "weights", dtype); + const std::vector input{CType(1), CType(2), CType(3), CType(4)}; + const std::vector weights{CType(10)}; + test->Reset(); + test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestWeights("weights", {1, 1, 1, 1}, weights); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor. + CheckAddedLayers(test, /*expect_scale_layer=*/true); + + // Check the dims of the output ITensor. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions()); + + std::vector output_data(4); + test->BuildAndRun({{"input", input}}, "my_binary", &output_data); + EXPECT_THAT(output_data, + ElementsAre(CType(11), CType(12), CType(13), CType(14))); +} + +template +void TestBinaryTensorOpWeightFallback(OpConverterTest* test, + const std::vector& input_dims, + const std::vector& weights_dims, + error::Code code = error::OK, + const char* error_msg_substr = nullptr, + const int input_batch_size = 1) { + const DataType dtype = DT_FLOAT; + typedef typename EnumToDataType::Type CType; + const size_t num_inputs = TrtDimsNumElements(GetTestDims(input_dims)); + const size_t num_weights = TrtDimsNumElements(GetTestDims(weights_dims)); + + test->Reset(); + const NodeDef node_def = + GetBinaryOpNodeDef("input", "weights", dtype); + test->AddTestTensor("input", /*dims=*/input_dims, input_batch_size, + TfDataTypeToTrt(dtype)); + test->AddTestWeights( + "weights", /*dims=*/weights_dims, + /*values=*/std::vector(num_weights, CType(1))); + test->RunValidationAndConversion(node_def, code, error_msg_substr); + if (code != error::OK) return; + + // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight. + CheckAddedLayers(test, /*expect_scale_layer=*/false); + + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + + // Check the dims of the output ITensor. + std::vector expected_output_dims = input_dims; + for (int i = expected_output_dims.size() - 1, j = weights_dims.size() - 1; + i >= 0 && j >= 0; --i, --j) { + if (expected_output_dims[i] == 1) { + expected_output_dims[i] = weights_dims[j]; + } + } + ExpectTrtDimsEqualsArray(expected_output_dims, + output.tensor()->getDimensions()); + + // Check the result of running the engine. + const int expected_num_outputs = + TrtDimsNumElements(GetTestDims(expected_output_dims)); + std::vector output_data(expected_num_outputs); + test->BuildAndRun( + {{"input", + /*input_data=*/std::vector(num_inputs, CType(2))}}, + "my_binary", &output_data); + if (node_def.op() == "Add") { + EXPECT_THAT(output_data, ElementsAreArray(std::vector( + expected_num_outputs, CType(3)))); + } else if (node_def.op() == "Minimum") { + EXPECT_THAT(output_data, ElementsAreArray(std::vector( + expected_num_outputs, CType(1)))); + } else { + ASSERT_TRUE(false); + } +} + +template +void TestBinaryTensorOpTensor(OpConverterTest* test) { + typedef typename EnumToDataType::Type CType; + test->Reset(); + const NodeDef node_def = + GetBinaryOpNodeDef("input1", "input2", dtype); + test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/1, + TfDataTypeToTrt(dtype)); + test->RunValidationAndConversion(node_def); + + // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight. + CheckAddedLayers(test, /*expect_scale_layer=*/false); + + // Check output dims. + TRT_TensorOrWeights output; + TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output)); + EXPECT_TRUE(output.is_tensor()); + ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions()); + + std::vector output_data(4); + // After broadcasting first input becomes {3, 6, 3, 6} and second input + // becomes {2, 3, 2, 3}. + test->BuildAndRun( + {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}}, + "my_binary", &output_data); + if (node_def.op() == "Add") { + EXPECT_THAT(output_data, + ElementsAre(CType(5), CType(8), CType(6), CType(9))); + } else if (node_def.op() == "Sub") { + EXPECT_THAT(output_data, + ElementsAre(CType(1), CType(4), CType(0), CType(3))); + } else if (node_def.op() == "Mul") { + EXPECT_THAT(output_data, + ElementsAre(CType(6), CType(12), CType(9), CType(18))); + } else if (node_def.op() == "Div") { + EXPECT_THAT(output_data, + ElementsAre(CType(1.5), CType(3), CType(1), CType(2))); + } else if (node_def.op() == "RealDiv") { + EXPECT_THAT(output_data, + ElementsAre(CType(1.5), CType(3), CType(1), CType(2))); + } else if (node_def.op() == "Minimum") { + EXPECT_THAT(output_data, + ElementsAre(CType(2), CType(2), CType(3), CType(3))); + } else if (node_def.op() == "Maximum") { + EXPECT_THAT(output_data, + ElementsAre(CType(3), CType(6), CType(3), CType(6))); + } else { + ASSERT_TRUE(false); + } +} + +TEST_F(OpConverterTest, ConvertBinary) { + // Input size doesn't match, should fail. + for (size_t num_inputs = 0; num_inputs < 2; ++num_inputs) { + Reset(); + NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"}); + AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT); + RunValidationAndConversion(node_def, error::INVALID_ARGUMENT, + "Binary ops require two inputs, at my_add"); + } + { + // Both inputs are weights. + Reset(); + NodeDef node_def = MakeNodeDef("my_add", "Add", {"weights1", "weights2"}); + AddTestWeights("weights1", {1}, {1}); + AddTestWeights("weights2", {1}, {1}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "Constant folding is falled back to TensorFlow, binary op received " + "both input as constant at: my_add"); + } + + // Test BinaryTensorOpWeight() without broadcasting. + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); +#if 0 + // TODO(b/119560144): it doesn't support FP16 constants and the following test + // will fail. + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); + TestBinaryTensorOpWeightNoBroadcast(this); +#endif + + // Test BinaryTensorOpWeight() with channel-wise broadcasting. + TestBinaryTensorOpWeightWithChannelWiseBroadcast(this); + + // Test BinaryTensorOpWeight() with uniformly broadcasting. + TestBinaryTensorOpWeightWithUniformlyBroadcast(this); + + // Test BinaryTensorOpWeight() falling back to BinaryTensorOpTensor(). + // Unsupported op. + TestBinaryTensorOpWeightFallback(this, {1, 1, 1}, {1}); + // Rank of input tensor dimension <3. + TestBinaryTensorOpWeightFallback(this, {1, 1}, {1}); + // Broadcast on batch dimension, should fail. + TestBinaryTensorOpWeightFallback( + this, {1, 1, 1}, {2, 1, 1, 1}, error::INVALID_ARGUMENT, + "Unsupported binary op broadcast scheme for op my_binary", + /*input_batch_size=*/2); + // Incompatible dims with per-channel mode. + TestBinaryTensorOpWeightFallback(this, {1, 1, 1}, {1, 2, 1}); + // Incompatible dims. + TestBinaryTensorOpWeightFallback(this, {1, 2, 1}, {2}); + + // Test BinaryTensorOpTensor() with broadcasting. + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); + TestBinaryTensorOpTensor(this); +} + +TEST_F(OpConverterTest, ConvertQuantize) { + for (const string& op : + {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars", + "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_quantize", op, {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + StrCat("Invalid number of inputs for ", op, ", at my_quantize") + .c_str()); + } + { + // FakeQuantWithMinMaxArgs attributes are empty, should fail. + NodeDef node_def = + MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"}); + AddTestTensor("input", {1, 2, 3}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Min or max attribute not found for FakeQuantWithMinMaxArgs " + "at my_quantize"); + } + { + // FakeQuantWithMinMaxArgs ranges set via attributes, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f); + auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("my_quantize"), + input, quantize_attrs); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } + { + // FakeQuantWithMinMaxVars ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::FakeQuantWithMinMaxVars( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } + { + // QuantizeAndDequantizeV2 ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::QuantizeAndDequantizeV2( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } + { + // QuantizeAndDequantizeV2 Range inputs are tensors, should fail. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto quantize = ops::QuantizeAndDequantizeV2( + s.WithOpName("my_quantize"), input, weights_min, weights_max); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestTensor("weights_min", {1}); + AddTestTensor("weights_max", {1}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Min and max inputs for QuantizeAndDequantizeV2 must be weights not " + "tensors, at my_quantize"); + } + { + // QuantizeAndDequantizeV3 ranges set via inputs, ok. + Reset(); + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT); + auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT); + auto num_bits = ops::Placeholder(s.WithOpName("num_bits"), DT_INT32); + auto quantize = ops::QuantizeAndDequantizeV3( + s.WithOpName("my_quantize"), input, weights_min, weights_max, num_bits); + const NodeDef& node_def = quantize.operation.node()->def(); + AddTestTensor("input", {1, 2, 3}); + AddTestWeights("weights_min", {1}, {-6.0f}); + AddTestWeights("weights_max", {1}, {6.0f}); + AddTestWeights("num_bits", {1}, {8}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(1, ranges.count(output.tensor())); + EXPECT_EQ(6.0f, ranges[output.tensor()]); + } +} + +TEST_F(OpConverterTest, ConvertRelu6) { + { + // Input list is empty, should fail. + NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Invalid number of inputs for Relu6, at my_relu6"); + } + + // Get the NodeDef for Relu6. + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input); + const NodeDef node_def = relu6.operation.node()->def(); + { + // Input is weights, should fail. + Reset(); + AddTestWeights("input", {1}, {1.0f}); + RunValidationAndConversion( + node_def, error::UNIMPLEMENTED, + "Relu6 is only implemented for tensors, not weights, at my_relu6"); + } + { + // Clip tensor values and set quantization ranges, ok. + Reset(); + AddTestTensor("input", {1, 2, 3}); + RunValidationAndConversion(node_def); + TRT_TensorOrWeights output; + TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output)); + EXPECT_TRUE(output.is_tensor()); + auto ranges = quantization_ranges(); + EXPECT_EQ(ranges[output.tensor()], 6.0f); + + std::vector output_data(6); + BuildAndRun({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6", + &output_data); + EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6)); + } +} + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc index b30d94b02824516906ea8880ac6de0bbee9e166c..4ac7e21d348cc1f04bed32a126fc87d5c1762ffd 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc @@ -67,6 +67,9 @@ tensorflow::Status TRTOptimizationPass::Init( TF_RETURN_IF_ERROR(GetPrecisionMode( Uppercase(params.at("precision_mode").s()), &precision_mode_)); } + if (params.count("use_calibration")) { + use_calibration_ = params.at("use_calibration").b(); + } return tensorflow::Status::OK(); } @@ -222,6 +225,12 @@ tensorflow::Status TRTOptimizationPass::Optimize( TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); tensorflow::tensorrt::convert::ConversionParams cp; + if (use_calibration_ && precision_mode_ != INT8MODE) { + LOG(ERROR) << "Calibration with FP32 or FP16 is not implemented. " + << "Falling back to use_calibration = False."; + use_calibration_ = false; + } + std::vector nodes_to_preserve; for (const auto& n : item.NodesToPreserve()) { auto tokens = str_util::Split(n, ":"); @@ -250,6 +259,7 @@ tensorflow::Status TRTOptimizationPass::Optimize( cp.is_dyn_op = is_dynamic_op_; cp.cached_engine_batches = batches_; cp.max_cached_engines = max_cached_batches_; + cp.use_calibration = use_calibration_; auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h index 71b51d13681cb3f75dad034f3fb0f73dea2bacc1..3e8dc0978e43e2e9ba07aaa09f74acfe8e59b9a7 100644 --- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h @@ -38,7 +38,8 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer { maximum_batch_size_(-1), is_dynamic_op_(false), max_cached_batches_(1), - max_workspace_size_bytes_(256LL << 20) { + max_workspace_size_bytes_(256LL << 20), + use_calibration_(true) { VLOG(1) << "Constructing " << name_; } @@ -67,6 +68,7 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer { std::vector batches_; int max_cached_batches_; int64_t max_workspace_size_bytes_; + bool use_calibration_; }; } // namespace convert diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index 019446813a56de6316a04c1738ae13d03e8f4713..1e907e0d2a669b2bef5fc6ca0822c1e6049c7018 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -124,8 +124,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("segment_funcdef_name", &funcdef_name_)); OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_)); - calibration_mode_ = - (precision_mode_ == INT8MODE && calibration_data.size() == 0); + OP_REQUIRES_OK(context, + context->GetAttr("use_calibration", &use_calibration_)); + calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE && + calibration_data.size() == 0); if (calibration_data.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); @@ -308,7 +310,7 @@ bool TRTEngineOp::ExecuteTrtEngine( std::vector buffers(num_binding); for (int i = 0; i < ctx->num_inputs(); i++) { const string input_name = StrCat(kInputPHName, i); - const size_t binding_index = + const int binding_index = trt_engine_ptr->getBindingIndex(input_name.c_str()); if (binding_index == -1) { LOG(ERROR) << "Input node not found, at " << input_name; @@ -345,7 +347,7 @@ bool TRTEngineOp::ExecuteTrtEngine( for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor const string output_name = StrCat(kOutputPHName, i); - const size_t binding_index = + const int binding_index = trt_engine_ptr->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -497,7 +499,8 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size, // means calibration_mode_ is true and this path won't get executed. auto status = convert::ConvertGraphDefToEngine( segment_graph_, precision_mode_, batch_size, workspace_size_, shapes, - &logger, allocator, calibrator_.get(), &engine, &convert_successfully); + &logger, allocator, calibrator_.get(), &engine, use_calibration_, + &convert_successfully); if (!status.ok()) { if (convert_successfully) { // This means it fail to build the engine even when the network is built @@ -586,6 +589,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources( *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(), cres->calibrator_.get(), &cres->engine_, + /*use_calibration=*/true, /*convert_successfully=*/nullptr); if (!s.ok()) { LOG(ERROR) << "Calibration failed: " << s; diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h index 8fe06758914261035c90a6fda3f114a63a8ac93a..b545f497f32d5a1a6960b748467ca189b7debf6c 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h @@ -130,6 +130,10 @@ class TRTEngineOp : public AsyncOpKernel { // The finalized calibrator for inference. std::unique_ptr calibrator_; + + // If true, create calibration graph for INT8 mode. Otherwise, we are using + // user-provided quantization ranges. + bool use_calibration_; }; } // namespace tensorrt diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc index e0c7b6272379a20e3dacb6cd7c3b39de735d844d..ce04e5806ed354e6b5a4905e790d78117cc84369 100644 --- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc @@ -39,8 +39,9 @@ REGISTER_OP("TRTEngineOp") .Attr("cached_engine_batches: list(int) = []") .Attr("max_cached_engines_count: int = 1") .Attr("workspace_size_bytes: int") - .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}") + .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}") .Attr("calibration_data: string = ''") + .Attr("use_calibration: bool = true") .Input("in_tensor: InT") .Output("out_tensor: OutT"); // TODO(jie): TF requires concrete output shape for concrete input shapes. diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py index d149dcbd563295b931812ed5671656a8641a549d..74a2c2392ad8ff935cb749bb11c9fb247fa367b0 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert.py @@ -63,19 +63,20 @@ class TrtPrecisionMode(object): return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8] -def tensorrt_rewriter_config(rewriter_config=None, - max_batch_size=1, - max_workspace_size_bytes=2 << 20, - precision_mode=TrtPrecisionMode.FP32, - minimum_segment_size=3, - is_dynamic_op=False, - maximum_cached_engines=1, - cached_engine_batch_sizes=None): +def get_tensorrt_rewriter_config(rewriter_config=None, + max_batch_size=1, + max_workspace_size_bytes=2 << 20, + precision_mode=TrtPrecisionMode.FP32, + minimum_segment_size=3, + is_dynamic_op=False, + maximum_cached_engines=1, + cached_engine_batch_sizes=None, + use_calibration=True): """Returns a RewriterConfig proto for TRT transformation. Args: - rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to. - If None, it will create one with default settings. + rewriter_config: a template RewriterConfig proto used to create a + TRT-enabled RewriterConfig. If None, it will use a default one. max_batch_size: max size for the input batch max_workspace_size_bytes: the maximum GPU temporary memory which the TRT engine can use at execution time. This corresponds to the 'workspaceSize' @@ -95,6 +96,15 @@ def tensorrt_rewriter_config(rewriter_config=None, use this list to determine the batch sizes of the cached engines, instead of making the decision on the fly. This is useful when we know the most common batch size(s) the application is going to generate. + use_calibration: this argument is ignored if precision_mode is not INT8. if + set to True, a calibration graph will be created to calibrate the missing + ranges. The calibration graph must be converted to an inference graph + using calib_graph_to_infer_graph() after running calibration. if set to + False, quantization nodes will be expected for every tensor in the graph + (exlcuding those which will be fused). If a range is missing, an error + will occur. Please note that accuracy may be negatively affected if there + is a mismatch between which tensors TRT quantizes and which tensors were + trained with fake quantization. Returns: A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler. @@ -107,13 +117,16 @@ def tensorrt_rewriter_config(rewriter_config=None, rewriter_config, rewriter_config_pb2.RewriterConfig): raise TypeError("rewriter_config should be a RewriterConfig proto.") + rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig() if rewriter_config is None: - rewriter_config = rewriter_config_pb2.RewriterConfig() # Layout optimizer may add Const nodes followed by Reshape nodes, thus we # need to run constant folding again. - rewriter_config.optimizers.extend(["constfold", "layout", "constfold"]) - rewriter_config.meta_optimizer_iterations = ( + rewriter_config_with_trt.optimizers.extend( + ["constfold", "layout", "constfold"]) + rewriter_config_with_trt.meta_optimizer_iterations = ( rewriter_config_pb2.RewriterConfig.ONE) + else: + rewriter_config_with_trt.CopyFrom(rewriter_config) if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes(): raise ValueError(("precision mode '{}' is not supported." @@ -121,7 +134,7 @@ def tensorrt_rewriter_config(rewriter_config=None, precision_mode, TrtPrecisionMode.supported_precision_modes)) - optimizer = rewriter_config.custom_optimizers.add() + optimizer = rewriter_config_with_trt.custom_optimizers.add() optimizer.name = "TensorRTOptimizer" optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size optimizer.parameter_map["max_batch_size"].i = max_batch_size @@ -138,7 +151,8 @@ def tensorrt_rewriter_config(rewriter_config=None, "maximum_cached_engines items.") optimizer.parameter_map["cached_engine_batches"].list.i.extend( cached_engine_batch_sizes) - return rewriter_config + optimizer.parameter_map["use_calibration"].b = use_calibration + return rewriter_config_with_trt def create_inference_graph(input_graph_def, @@ -150,7 +164,7 @@ def create_inference_graph(input_graph_def, is_dynamic_op=False, maximum_cached_engines=1, cached_engine_batch_sizes=None, - rewriter_config=None, + use_calibration=True, input_saved_model_dir=None, input_saved_model_tags=None, output_saved_model_dir=None, @@ -182,8 +196,15 @@ def create_inference_graph(input_graph_def, use this list to determine the batch sizes of the cached engines, instead of making the decision on the fly. This is useful when we know the most common batch size(s) the application is going to generate. - rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to. - If None, it will create one with default settings. + use_calibration: this argument is ignored if precision_mode is not INT8. if + set to True, a calibration graph will be created to calibrate the missing + ranges. The calibration graph must be converted to an inference graph + using calib_graph_to_infer_graph() after running calibration. if set to + False, quantization nodes will be expected for every tensor in the graph + (exlcuding those which will be fused). If a range is missing, an error + will occur. Please note that accuracy may be negatively affected if there + is a mismatch between which tensors TRT quantizes and which tensors were + trained with fake quantization. input_saved_model_dir: the directory to load the SavedModel which contains the input graph to transforms. Used only when input_graph_def is None. input_saved_model_tags: list of tags to load the SavedModel. @@ -191,8 +212,9 @@ def create_inference_graph(input_graph_def, returned GraphDef and save it to the specified directory. This option only works when the input graph is loaded from a SavedModel, i.e. when input_saved_model_dir is specified and input_graph_def is None. - session_config: the ConfigProto used to create a Session. If not specified, - a default ConfigProto will be used. + session_config: the ConfigProto used to create a Session. It's also used as + a template to create a TRT-enabled ConfigProto for conversion. If not + specified, a default ConfigProto will be used. Returns: A GraphDef transformed from input_graph_def (or the SavedModel graph def @@ -322,23 +344,30 @@ def create_inference_graph(input_graph_def, grappler_meta_graph_def.collection_def["train_op"].CopyFrom( output_collection) - # Create RewriterConfig. - config = config_pb2.ConfigProto() - config.graph_options.rewrite_options.CopyFrom( - tensorrt_rewriter_config( - rewriter_config, max_batch_size, max_workspace_size_bytes, - precision_mode, minimum_segment_size, is_dynamic_op, - maximum_cached_engines, cached_engine_batch_sizes)) + # Create TRT-enabled ConfigProto. + session_config_with_trt = config_pb2.ConfigProto() + session_config_with_trt.CopyFrom(session_config) + rewriter_config = None + if (session_config_with_trt.HasField("graph_options") and + session_config_with_trt.graph_options.HasField("rewrite_options")): + rewriter_config = session_config_with_trt.graph_options.rewrite_options + rewriter_config_with_trt = get_tensorrt_rewriter_config( + rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode, + minimum_segment_size, is_dynamic_op, maximum_cached_engines, + cached_engine_batch_sizes, use_calibration) + session_config_with_trt.graph_options.rewrite_options.CopyFrom( + rewriter_config_with_trt) # Run Grappler. transformed_graph_def = tf_optimizer.OptimizeGraph( - config, grappler_meta_graph_def, graph_id=b"tf_graph") + session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph") # Optionally write the transformed graphdef as SavedModel. if output_saved_model_dir is not None: saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir) with ops.Graph().as_default(): importer.import_graph_def(transformed_graph_def, name="") + # We don't use TRT here. with session.Session(config=session_config) as sess: saved_model_builder.add_meta_graph_and_variables( sess, diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py index 9f2eeac990dcacb547d336b68bc042016c3e6171..aa82f4207f5fa9c646cadbc4ca4fd7ab40c089ff 100644 --- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py +++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py @@ -47,9 +47,9 @@ from tensorflow.python.tools import saved_model_utils class TrtConvertTest(test_util.TensorFlowTestCase): """Class to test Tensorflow-TensorRT integration python API.""" - def testTensorrtRewriterConfig(self): - """Test case for trt_convert.tensorrt_rewriter_config().""" - rewriter_cfg = trt_convert.tensorrt_rewriter_config( + def testGetTensorrtRewriterConfig(self): + """Test case for trt_convert.get_tensorrt_rewriter_config().""" + rewriter_cfg = trt_convert.get_tensorrt_rewriter_config( rewriter_config=None, max_batch_size=128, max_workspace_size_bytes=1234, diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py index 18096e0ff1ec6b9872346d8a84ac93c542cfb643..b325d76edfabce25f165a6b23c5f39bb6ac84247 100644 --- a/tensorflow/contrib/tensorrt/test/base_test.py +++ b/tensorflow/contrib/tensorrt/test/base_test.py @@ -56,8 +56,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase): strides=[1, 2, 2, 1], padding="SAME", name="conv") - bias = constant_op.constant( - [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype) + bias = constant_op.constant([4., 1.5, 2., 3., 5., 7.], + name="bias", + dtype=dtype) added = nn.bias_add(conv, bias, name="bias_add") relu = nn.relu(added, "relu") identity = array_ops.identity(relu, "identity") @@ -73,11 +74,12 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which - # breaks the connection check, fix it. - # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add", - # "relu", "identity", "max_pool"] - return ["my_trt_op_0"] + return { + "my_trt_op_0": [ + "weights", "conv", "bias", "bias_add", "relu", "identity", + "max_pool" + ] + } class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): @@ -92,7 +94,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): g = ops.Graph() with g.as_default(): inp = array_ops.placeholder( - dtype=dtype, shape=[None] + input_dims[1:], name=input_name) + dtype=dtype, shape=input_dims, name=input_name) with g.device("/GPU:0"): conv_filter = constant_op.constant( [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]], @@ -105,10 +107,10 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): padding="SAME", name="conv") c1 = constant_op.constant( - np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c1") + np.random.randn(12, 12, 6), dtype=dtype, name="c1") p = math_ops.mul(conv, c1, name="mul") c2 = constant_op.constant( - np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c2") + np.random.randn(12, 12, 6), dtype=dtype, name="c2") q = math_ops.div(conv, c2, name="div") edge = self.trt_incompatible_op(q, name="incompatible") @@ -129,22 +131,21 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase): def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" - # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which - # breaks the connection check, fix it. - # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1", - # "add", "sub1"]; - # - my_trt_op_1 should have ["weights","conv", "div"] - return ["my_trt_op_0", "my_trt_op_1"] + return { + "my_trt_op_0": [ + "add", "add1", "c1", "div1", "mul", "mul1", "sub", "sub1" + ], + "my_trt_op_1": ["c2", "conv", "div", "weights"] + } - def ShouldRunTest(self, run_params): - # TODO(aaroey): LayoutOptimizer adds Transpose(Const, Const) to the graph - # which breaks the conversion. We should fix it as: - # - Detect the invalid NodeDef earlier before adding them to segment - # - Let it able to change the RewriterConfig when calling - # create_inference_graph(). - # It will be good to add debugging feature for Grappler to print the graph - # after running each optimizer. - return False + def GetConversionParams(self, run_params): + """Return a ConversionParams for test.""" + return super( + SimpleMultiEnginesTest, self + ).GetConversionParams(run_params)._replace( + # Disable layout optimizer, since it'll add Transpose(Const, Const) to + # the graph and breaks the conversion check. + rewriter_config=trt_test.OptimizerDisabledRewriterConfig()) class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase): @@ -197,7 +198,9 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase): """Whether to run the test.""" # Disable the test in fp16 mode since multiple matmul and add ops together # can cause overflow. - return run_params.precision_mode != "FP16" + return ((run_params.precision_mode != "FP16") and + not (trt_test.IsQuantizationMode(run_params.precision_mode) and + not run_params.use_calibration)) class PartiallyConvertedTestB(PartiallyConvertedTestA): diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py index 7545bb9df20f295a8fdbc82b573cdb3407f8c5e4..6546ef64778e0ee3638b3aea08c61a9b32e0dc7b 100644 --- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py +++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py @@ -41,6 +41,7 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): input_name = "input" input_matrix_rows = 4 input_matrix_columns = 144 + # Note that tf.nn.bias_add supports up to 5 dimensions. input_dims = [input_matrix_rows, input_matrix_columns] output_name = "output" g = ops.Graph() @@ -74,18 +75,18 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): x5 = nn.bias_add(x5, b) x5 = gen_array_ops.reshape(x5, [4, -1]) - x6 = gen_array_ops.reshape(x, [4, 12, 12]) - b = self._ConstOp((12,)) + x6 = gen_array_ops.reshape(x, [4, 24, 6]) + b = self._ConstOp((6,)) x6 = nn.bias_add(x6, b, data_format="NHWC") x6 = gen_array_ops.reshape(x6, [4, -1]) - x7 = gen_array_ops.reshape(x, [4, 12, 3, 4]) - b = self._ConstOp((4,)) + x7 = gen_array_ops.reshape(x, [4, 12, 4, 3]) + b = self._ConstOp((3,)) x7 = nn.bias_add(x7, b, data_format="NHWC") x7 = gen_array_ops.reshape(x7, [4, -1]) - x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2]) - b = self._ConstOp((2,)) + x8 = gen_array_ops.reshape(x, [4, 4, 3, 2, 6]) + b = self._ConstOp((6,)) x8 = nn.bias_add(x8, b, data_format="NHWC") x8 = gen_array_ops.reshape(x8, [4, -1]) @@ -94,13 +95,13 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): x9 = nn.bias_add(x9, b, data_format="NCHW") x9 = gen_array_ops.reshape(x9, [4, -1]) - x10 = gen_array_ops.reshape(x, [4, 12, 3, 4]) - b = self._ConstOp((12,)) + x10 = gen_array_ops.reshape(x, [4, 3, 4, 12]) + b = self._ConstOp((3,)) x10 = nn.bias_add(x10, b, data_format="NCHW") x10 = gen_array_ops.reshape(x10, [4, -1]) - x11 = gen_array_ops.reshape(x, [4, 12, 12]) - b = self._ConstOp((12,)) + x11 = gen_array_ops.reshape(x, [4, 6, 24]) + b = self._ConstOp((6,)) x11 = nn.bias_add(x11, b, data_format="NCHW") x11 = gen_array_ops.reshape(x11, [4, -1]) @@ -116,9 +117,14 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase): def GetConversionParams(self, run_params): """Return a ConversionParams for test.""" - return super(BiasaddMatMulTest, - self).GetConversionParams(run_params)._replace( - max_batch_size=4, maximum_cached_engines=1) + conversion_params = super(BiasaddMatMulTest, + self).GetConversionParams(run_params) + return conversion_params._replace( + max_batch_size=4, + maximum_cached_engines=1, + # Disable layout optimizer, since it will convert BiasAdd with NHWC + # format to NCHW format under four dimentional input. + rewriter_config=trt_test.OptimizerDisabledRewriterConfig()) def ExpectedEnginesToBuild(self, run_params): """Return the expected engines to build.""" diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py new file mode 100644 index 0000000000000000000000000000000000000000..e7d6ec4ad395d38a06f97020f2f363009f2286c7 --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py @@ -0,0 +1,290 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Script to test TF-TRT INT8 conversion without calibration on Mnist model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.tensorrt.python import trt_convert +# pylint: disable=unused-import +from tensorflow.contrib.tensorrt.python.ops import trt_engine_op +# pylint: enable=unused-import +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python import data +from tensorflow.python import keras +from tensorflow.python.estimator.estimator import Estimator +from tensorflow.python.estimator.model_fn import EstimatorSpec +from tensorflow.python.estimator.model_fn import ModeKeys +from tensorflow.python.estimator.run_config import RunConfig +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import graph_util +from tensorflow.python.framework import importer +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.keras.datasets import mnist +from tensorflow.python.layers import layers +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics +from tensorflow.python.ops import nn +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.losses import losses +from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.summary import summary +from tensorflow.python.training import saver +from tensorflow.python.training.adam import AdamOptimizer +from tensorflow.python.training.checkpoint_management import latest_checkpoint +from tensorflow.python.training.training_util import get_global_step + +INPUT_NODE_NAME = 'input' +OUTPUT_NODE_NAME = 'output' + + +class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase): + + def _BuildGraph(self, x): + + def _Quantize(x, r): + x = gen_array_ops.quantize_and_dequantize_v2(x, -r, r) + return x + + def _DenseLayer(x, num_inputs, num_outputs, quantization_range, name): + """Dense layer with quantized outputs. + + Args: + x: input to the dense layer + num_inputs: number of input columns of x + num_outputs: number of output columns + quantization_range: the min/max range for quantization + name: name of the variable scope + + Returns: + The output of the layer. + """ + with variable_scope.variable_scope(name): + kernel = variable_scope.get_variable( + 'kernel', + shape=[num_inputs, num_outputs], + dtype=dtypes.float32, + initializer=keras.initializers.glorot_uniform()) + bias = variable_scope.get_variable( + 'bias', + shape=[num_outputs], + dtype=dtypes.float32, + initializer=keras.initializers.zeros()) + x = math_ops.matmul(x, kernel) + x = _Quantize(x, quantization_range) + x = nn.bias_add(x, bias) + x = _Quantize(x, quantization_range) + return x + + x = _Quantize(x, 1) + # Conv + Bias + Relu6 + x = layers.conv2d(x, filters=32, kernel_size=3, use_bias=True) + x = nn.relu6(x) + # Conv + Bias + Relu6 + x = layers.conv2d(x, filters=64, kernel_size=3, use_bias=True) + x = nn.relu6(x) + # Reduce + x = math_ops.reduce_mean(x, [1, 2]) + x = _Quantize(x, 6) + # FC1 + x = _DenseLayer(x, 64, 512, 6, name='dense') + x = nn.relu6(x) + # FC2 + x = _DenseLayer(x, 512, 10, 25, name='dense_1') + x = array_ops.identity(x, name=OUTPUT_NODE_NAME) + return x + + def _GetGraphDef(self, use_trt, max_batch_size, model_dir): + """Get the frozen mnist GraphDef. + + Args: + use_trt: whether use TF-TRT to convert the graph. + max_batch_size: the max batch size to apply during TF-TRT conversion. + model_dir: the model directory to load the checkpoints. + + Returns: + The frozen mnist GraphDef. + """ + graph = ops.Graph() + with self.session(graph=graph) as sess: + with graph.device('/GPU:0'): + x = array_ops.placeholder( + shape=(None, 28, 28, 1), dtype=dtypes.float32, name=INPUT_NODE_NAME) + self._BuildGraph(x) + # Load weights + mnist_saver = saver.Saver() + checkpoint_file = latest_checkpoint(model_dir) + mnist_saver.restore(sess, checkpoint_file) + # Freeze + graph_def = graph_util.convert_variables_to_constants( + sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME]) + # Convert with TF-TRT + if use_trt: + logging.info('Number of nodes before TF-TRT conversion: %d', + len(graph_def.node)) + graph_def = trt_convert.create_inference_graph( + graph_def, + outputs=[OUTPUT_NODE_NAME], + max_batch_size=max_batch_size, + precision_mode='INT8', + max_workspace_size_bytes=4096 << 19, + minimum_segment_size=2, + use_calibration=False, + ) + logging.info('Number of nodes after TF-TRT conversion: %d', + len(graph_def.node)) + num_engines = len( + [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp']) + self.assertEqual(1, num_engines) + return graph_def + + def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir): + """Train or evaluate the model. + + Args: + is_training: whether to train or evaluate the model. In training mode, + quantization will be simulated where the quantize_and_dequantize_v2 are + placed. + use_trt: if true, use TRT INT8 mode for evaluation, which will perform + real quantization. Otherwise use native TensorFlow which will perform + simulated quantization. Ignored if is_training is True. + batch_size: batch size. + num_epochs: how many epochs to train. Ignored if is_training is False. + model_dir: where to save or load checkpoint. + + Returns: + The Estimator evaluation result. + """ + # Get dataset + train_data, test_data = mnist.load_data() + + def _PreprocessFn(x, y): + x = math_ops.cast(x, dtypes.float32) + x = array_ops.expand_dims(x, axis=2) + x = 2.0 * (x / 255.0) - 1.0 + y = math_ops.cast(y, dtypes.int32) + return x, y + + def _EvalInputFn(): + mnist_x, mnist_y = test_data + dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) + dataset = dataset.apply( + data.experimental.map_and_batch( + map_func=_PreprocessFn, + batch_size=batch_size, + num_parallel_calls=8)) + dataset = dataset.repeat(count=1) + iterator = dataset.make_one_shot_iterator() + features, labels = iterator.get_next() + return features, labels + + def _TrainInputFn(): + mnist_x, mnist_y = train_data + dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y)) + dataset = dataset.shuffle(2 * len(mnist_x)) + dataset = dataset.apply( + data.experimental.map_and_batch( + map_func=_PreprocessFn, + batch_size=batch_size, + num_parallel_calls=8)) + dataset = dataset.repeat(count=num_epochs) + iterator = dataset.make_one_shot_iterator() + features, labels = iterator.get_next() + return features, labels + + def _ModelFn(features, labels, mode): + if is_training: + logits_out = self._BuildGraph(features) + else: + graph_def = self._GetGraphDef(use_trt, batch_size, model_dir) + logits_out = importer.import_graph_def( + graph_def, + input_map={INPUT_NODE_NAME: features}, + return_elements=[OUTPUT_NODE_NAME + ':0'], + name='')[0] + + loss = losses.sparse_softmax_cross_entropy( + labels=labels, logits=logits_out) + summary.scalar('loss', loss) + + classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out') + accuracy = metrics.accuracy( + labels=labels, predictions=classes_out, name='acc_op') + summary.scalar('accuracy', accuracy[1]) + + if mode == ModeKeys.EVAL: + return EstimatorSpec( + mode, loss=loss, eval_metric_ops={'accuracy': accuracy}) + elif mode == ModeKeys.TRAIN: + optimizer = AdamOptimizer(learning_rate=1e-2) + train_op = optimizer.minimize(loss, global_step=get_global_step()) + return EstimatorSpec(mode, loss=loss, train_op=train_op) + + config_proto = config_pb2.ConfigProto() + config_proto.gpu_options.allow_growth = True + estimator = Estimator( + model_fn=_ModelFn, + model_dir=model_dir if is_training else None, + config=RunConfig(session_config=config_proto)) + + if is_training: + estimator.train(_TrainInputFn) + results = estimator.evaluate(_EvalInputFn) + logging.info('accuracy: %s', str(results['accuracy'])) + return results + + # To generate the checkpoint, set a different model_dir and call self._Run() + # by setting is_training=True and num_epochs=1000, e.g.: + # model_dir = '/tmp/quantization_mnist' + # self._Run( + # is_training=True, + # use_trt=False, + # batch_size=128, + # num_epochs=100, + # model_dir=model_dir) + def testEval(self): + if not trt_convert.is_tensorrt_enabled(): + return + model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata') + + accuracy_tf_native = self._Run( + is_training=False, + use_trt=False, + batch_size=128, + num_epochs=None, + model_dir=model_dir)['accuracy'] + logging.info('accuracy_tf_native: %f', accuracy_tf_native) + self.assertAllClose(accuracy_tf_native, 0.9662) + + if trt_convert.get_linked_tensorrt_version()[0] < 5: + return + + accuracy_tf_trt = self._Run( + is_training=False, + use_trt=True, + batch_size=128, + num_epochs=None, + model_dir=model_dir)['accuracy'] + logging.info('accuracy_tf_trt: %f', accuracy_tf_trt) + self.assertAllClose(accuracy_tf_trt, 0.9677) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/contrib/tensorrt/test/quantization_test.py new file mode 100644 index 0000000000000000000000000000000000000000..28353273edec4a2b0fd4300f87b0b1a4dbe37652 --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/quantization_test.py @@ -0,0 +1,144 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Model script to test TF-TensorRT integration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.tensorrt.python import trt_convert +from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +def _GetParams(add_quantization_nodes, dtype=dtypes.float32): + input_name = "input" + input_dims = [8, 8] + output_name = "output" + + def _Quantize(x, r): + if add_quantization_nodes: + x = gen_array_ops.fake_quant_with_min_max_vars(x, -r, r) + return x + + g = ops.Graph() + with g.as_default(): + x = array_ops.placeholder( + dtype=dtype, shape=[None] + input_dims[1:], name=input_name) + x = _Quantize(x, 10.0) + x = x + 5 + x = _Quantize(x, 15.0) + x = x - 5 + x = _Quantize(x, 10.0) + x = x * 0.1 + x = _Quantize(x, 1.0) + w = constant_op.constant(np.ones((8, 1)), dtype=dtypes.float32) + x = math_ops.matmul(x, w) + x = _Quantize(x, 10.0) + x = array_ops.identity(x, name=output_name) + + return trt_test.TfTrtIntegrationTestParams( + gdef=g.as_graph_def(), + input_names=[input_name], + input_dims=[input_dims], + output_names=[output_name], + expected_output_dims=[(8, 1)]) + + +class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment with no quantization ranges.""" + return _GetParams(add_quantization_nodes=False) + + def ShouldRunTest(self, run_params): + if trt_convert.get_linked_tensorrt_version()[0] < 5: + return False + # Only test static engine mode, with or without calibration. + return (trt_test.IsQuantizationMode(run_params.precision_mode) and + not run_params.use_optimizer and not run_params.dynamic_engine) + + def ExpectedEnginesToBuild(self, run_params): + """Return the expected engines to build.""" + if run_params.use_calibration: + # In static engine mode with calibration, it should build a calibration + # engine. + return ["my_trt_op_0"] + # In static engine mode without calibration, the engine building will fail + # since no quantization ranges are set, which results in no TRT nodes. + return [] + + +class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment with no quantization ranges.""" + return _GetParams(add_quantization_nodes=True) + + def ShouldRunTest(self, run_params): + if trt_convert.get_linked_tensorrt_version()[0] < 5: + return False + # Test static/dynamic engine with/without calibration. + return (trt_test.IsQuantizationMode(run_params.precision_mode) and + not run_params.use_optimizer) + + def ExpectedEnginesToBuild(self, run_params): + """Return the expected engines to build.""" + return ["my_trt_op_0"] + + def ExpectedAbsoluteTolerance(self, run_params): + """The absolute tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + def ExpectedRelativeTolerance(self, run_params): + """The relative tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + +class NonQuantizedPrecisionsWithRangesTest(trt_test.TfTrtIntegrationTestBase): + + def GetParams(self): + """Create a graph containing single segment with no quantization ranges.""" + return _GetParams(add_quantization_nodes=True) + + def ShouldRunTest(self, run_params): + # Only test FP32/FP16 mode. + return not trt_test.IsQuantizationMode(run_params.precision_mode) + + def ExpectedEnginesToBuild(self, run_params): + """Return the expected engines to build.""" + # The fake quant ops are not supported in FP32/FP16 mode, and will split the + # graph into three TRT segments. + return ["my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3"] + + def ExpectedAbsoluteTolerance(self, run_params): + """The absolute tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + def ExpectedRelativeTolerance(self, run_params): + """The relative tolerance to compare floating point results.""" + return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01 + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/contrib/tensorrt/test/testdata/checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..a603e1aec91adab04fd9801ba05a2ee9adfbb6e8 --- /dev/null +++ b/tensorflow/contrib/tensorrt/test/testdata/checkpoint @@ -0,0 +1,3 @@ +model_checkpoint_path: "model.ckpt-46900" +all_model_checkpoint_paths: "model.ckpt-0" +all_model_checkpoint_paths: "model.ckpt-46900" diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 new file mode 100644 index 0000000000000000000000000000000000000000..88a998f184b275121e1e76eb51d2310da149f10a Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 differ diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index new file mode 100644 index 0000000000000000000000000000000000000000..537976571337508ab1798d33646c51d62a146ecc Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index differ diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py index a725d0651c92fe18bcfd284cffd40cdfec2e6c69..80eb8552fd01531be76c228c10830c2fa33a2dec 100644 --- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py @@ -30,6 +30,7 @@ from tensorflow.contrib.tensorrt.python import trt_convert from tensorflow.contrib.tensorrt.python.ops import trt_engine_op # pylint: enable=unused-import from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.framework import dtypes from tensorflow.python.framework import graph_io from tensorflow.python.framework import importer @@ -42,14 +43,15 @@ TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [ "gdef", "input_names", "input_dims", "output_names", "expected_output_dims" ]) -RunParams = namedtuple( - "RunParams", - ["use_optimizer", "precision_mode", "dynamic_engine", "test_name"]) +RunParams = namedtuple("RunParams", [ + "use_optimizer", "precision_mode", "dynamic_engine", "test_name", + "use_calibration" +]) ConversionParams = namedtuple("ConversionParams", [ "max_batch_size", "max_workspace_size_bytes", "precision_mode", "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines", - "cached_engine_batch_sizes", "rewriter_config" + "cached_engine_batch_sizes", "rewriter_config", "use_calibration" ]) PRECISION_MODES = ["FP32", "FP16", "INT8"] @@ -65,6 +67,34 @@ class GraphState(object): INFERENCE = 2 +def OptimizerDisabledRewriterConfig(): + """Returns a RewriterConfig with all default Grappler optimizers disabled.""" + rewriter_config = rewriter_config_pb2.RewriterConfig() + + # Turn off all default Grappler optimizers. + off = rewriter_config_pb2.RewriterConfig.OFF + rewriter_config.layout_optimizer = off + rewriter_config.constant_folding = off + rewriter_config.shape_optimization = off + rewriter_config.remapping = off + rewriter_config.arithmetic_optimization = off + rewriter_config.dependency_optimization = off + rewriter_config.loop_optimization = off + rewriter_config.function_optimization = off + rewriter_config.debug_stripper = off + rewriter_config.disable_model_pruning = True + rewriter_config.scoped_allocator_optimization = off + rewriter_config.memory_optimization = ( + rewriter_config_pb2.RewriterConfig.NO_MEM_OPT) + rewriter_config.pin_to_host_optimization = off + rewriter_config.auto_parallel.enable = False + + # Run only once for each enabled optimizer. + rewriter_config.meta_optimizer_iterations = ( + rewriter_config_pb2.RewriterConfig.ONE) + return rewriter_config + + class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): """Class to test Tensorflow-TensorRT integration.""" @@ -139,11 +169,15 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_op=run_params.dynamic_engine, maximum_cached_engines=1, cached_engine_batch_sizes=None, - rewriter_config=None) + rewriter_config=None, + use_calibration=run_params.use_calibration) def ShouldRunTest(self, run_params): """Whether to run the test.""" - return True + # This setting combination requires quantization nodes to be present in + # order to build the engine. + return not (IsQuantizationMode(run_params.precision_mode) and + not run_params.use_calibration) def VerifyRunForEngine(self, engine_name, graph_state, expect_run=True): """Verify the state of a particular engine after sess.run().""" @@ -198,30 +232,31 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration") trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment") + def _GetGPUOptions(self): + gpu_options = config_pb2.GPUOptions() + gpu_options.allow_growth = True + return gpu_options + def _GetConfigProto(self, run_params, graph_state): """Get config proto based on specific settings.""" if graph_state != GraphState.ORIGINAL and run_params.use_optimizer: conversion_params = self.GetConversionParams(run_params) - rewriter_cfg = trt_convert.tensorrt_rewriter_config( + rewriter_cfg = trt_convert.get_tensorrt_rewriter_config( conversion_params.rewriter_config, conversion_params.max_batch_size, conversion_params.max_workspace_size_bytes, conversion_params.precision_mode, conversion_params.minimum_segment_size, conversion_params.is_dynamic_op, conversion_params.maximum_cached_engines, - conversion_params.cached_engine_batch_sizes) + conversion_params.cached_engine_batch_sizes, + conversion_params.use_calibration) graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg) else: graph_options = config_pb2.GraphOptions() - gpu_options = config_pb2.GPUOptions() - gpu_options.allow_growth = True - if trt_convert.get_linked_tensorrt_version()[0] == 3: - gpu_options.per_process_gpu_memory_fraction = 0.50 - config = config_pb2.ConfigProto( - gpu_options=gpu_options, graph_options=graph_options) + gpu_options=self._GetGPUOptions(), graph_options=graph_options) return config def _ExpectTestValue(self, engine_name, method, expected_value): @@ -291,6 +326,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): params = self._GetParamsCached() conversion_params = self.GetConversionParams(run_params) logging.info(conversion_params) + + config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions()) + if conversion_params.rewriter_config is not None: + config_for_trt.graph_options.rewrite_options.CopyFrom( + conversion_params.rewriter_config) return trt_convert.create_inference_graph( input_graph_def=gdef, outputs=params.input_names + params.output_names, @@ -301,7 +341,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_op=conversion_params.is_dynamic_op, maximum_cached_engines=conversion_params.maximum_cached_engines, cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes, - rewriter_config=conversion_params.rewriter_config) + use_calibration=conversion_params.use_calibration, + session_config=config_for_trt) def _WriteGraph(self, run_params, gdef, graph_state): if graph_state == GraphState.ORIGINAL: @@ -400,10 +441,12 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_engine = not node.attr["static_engine"].b self.assertEqual(run_params.dynamic_engine, is_dynamic_engine, node.name) + self.assertEqual(node.attr["use_calibration"].b, + run_params.use_calibration, node.name) has_calibration_data = len(node.attr["calibration_data"].s) if (IsQuantizationMode(run_params.precision_mode) and - graph_state == GraphState.INFERENCE): + run_params.use_calibration and graph_state == GraphState.INFERENCE): self.assertTrue(has_calibration_data, node.name) else: self.assertFalse(has_calibration_data, node.name) @@ -438,6 +481,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): # types. scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0 dims = params.input_dims[i] + # TODO(laigd): add debug options. E.g. we can set the input data to be + # continuous natural numbers: + # seq = np.arange(np.prod(dims)) + # seq.resize(dims) + # input_data.append(scale * seq.astype(dtype)) input_data.append((scale * np.random.random_sample(dims)).astype(dtype)) self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL) @@ -449,7 +497,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): config_no_trt, GraphState.ORIGINAL) # Run calibration if necessary. - if IsQuantizationMode(run_params.precision_mode): + if (IsQuantizationMode(run_params.precision_mode) and + run_params.use_calibration): calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE) logging.info("Running calibration graph, config:\n%s", str(calib_config)) @@ -519,27 +568,38 @@ def _AddTests(test_class): use_optimizer_options = [False, True] dynamic_engine_options = [False, True] - for (use_optimizer, precision_mode, dynamic_engine) in itertools.product( - use_optimizer_options, PRECISION_MODES, dynamic_engine_options): + use_calibration_options = [False, True] + opts = itertools.product(use_optimizer_options, PRECISION_MODES, + dynamic_engine_options, use_calibration_options) + for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts: if IsQuantizationMode(precision_mode): if use_optimizer: # TODO(aaroey): if use_optimizer is True we need to get the inference # graphdef using custom python wrapper class, which is not currently # supported yet. continue - if not dynamic_engine: + if use_calibration and not dynamic_engine: + # Static engine with use_calibration=False will be static, so we want to + # test that. If use_calibration=True, only dynamic op is supported. # TODO(aaroey): construction of static calibration engine is not # supported yet. continue + else: + if use_calibration: + # Don't calibrate in FP32 or FP16 mode + continue conversion = "OptimizerConversion" if use_optimizer else "ToolConversion" - engine_type = ("DynamicEngine" if dynamic_engine else "StaticEngine") - test_name = "%s_%s_%s" % (conversion, precision_mode, engine_type) + engine_type = "DynamicEngine" if dynamic_engine else "StaticEngine" + calibration_type = "UseCalibration" if use_calibration else "NoCalibration" + test_name = "%s_%s_%s_%s" % (conversion, engine_type, precision_mode, + calibration_type) run_params = RunParams( use_optimizer=use_optimizer, precision_mode=precision_mode, dynamic_engine=dynamic_engine, - test_name=test_name) + test_name=test_name, + use_calibration=use_calibration) setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params)) diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD index c230919168b937b26c68e141e15f0762ad70f3e6..ae7db35b47b326272dd2c7bc76e18047cec59865 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD @@ -106,6 +106,7 @@ py_test( ], srcs_version = "PY2AND3", tags = [ + "no_mac", "no_pip_gpu", # b/63391119 "nomsan", # Takes too long to run. "notsan", # b/67865658 diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py index 43c5267e632e464d43ffcbcf6c551ff83d3c5767..aab330643862c1ccf073d2a0e34e1c475b1ec15f 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py +++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py @@ -802,7 +802,7 @@ class InputStatisticsFromMiniBatch(object): array_ops.shape(times)[1] - 1, self._dtype)) # Co-locate updates with their variables to minimize race conditions when # updating statistics. - with ops.colocate_with(auxiliary_variables.max_time_seen): + with ops.device(auxiliary_variables.max_time_seen.device): # There is a race condition if this value is being updated from multiple # workers. However, it should eventually reach the correct value if the # last chunk is presented enough times. @@ -810,16 +810,16 @@ class InputStatisticsFromMiniBatch(object): auxiliary_variables.max_time_seen, gen_math_ops.maximum(auxiliary_variables.max_time_seen, math_ops.reduce_max(times))) - with ops.colocate_with(auxiliary_variables.chunk_count): + with ops.device(auxiliary_variables.chunk_count.device): chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count, array_ops.shape( times, out_type=dtypes.int64)[0]) - with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum): + with ops.device(auxiliary_variables.inter_observation_duration_sum.device): inter_observation_duration_assign = state_ops.assign_add( auxiliary_variables.inter_observation_duration_sum, math_ops.reduce_sum(batch_inter_observation_duration)) - with ops.colocate_with(auxiliary_variables.example_count): + with ops.device(auxiliary_variables.example_count.device): example_count_assign = state_ops.assign_add( auxiliary_variables.example_count, array_ops.size(times, out_type=dtypes.int64)) @@ -829,11 +829,11 @@ class InputStatisticsFromMiniBatch(object): # the series are then members of fewer chunks. For series which are much # longer than the chunk size (the usual/expected case), this effect becomes # irrelevant. - with ops.colocate_with(auxiliary_variables.overall_feature_sum): + with ops.device(auxiliary_variables.overall_feature_sum.device): overall_feature_sum_assign = state_ops.assign_add( auxiliary_variables.overall_feature_sum, math_ops.reduce_sum(values, axis=[0, 1])) - with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares): + with ops.device(auxiliary_variables.overall_feature_sum_of_squares.device): overall_feature_sum_of_squares_assign = state_ops.assign_add( auxiliary_variables.overall_feature_sum_of_squares, math_ops.reduce_sum(values**2, axis=[0, 1])) @@ -869,7 +869,7 @@ class InputStatisticsFromMiniBatch(object): state_ops.assign(statistics.series_start_moments.mean, mean), state_ops.assign(statistics.series_start_moments.variance, variance)) - with ops.colocate_with(statistics.start_time): + with ops.device(statistics.start_time.device): series_start_update = control_flow_ops.cond( # Update moments whenever we even match the lowest time seen so far, # to ensure that series start statistics are eventually updated to diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD index 3c07a74ed8af9e3ab70408f9b43cb62b6bd4c7f2..125750e7639ad40c481472a93353e6fb7055be96 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD +++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD @@ -40,7 +40,10 @@ py_test( timeout = "long", # Moderate but for asan srcs = ["state_space_model_test.py"], srcs_version = "PY2AND3", - tags = ["no_windows"], # TODO: needs investigation on Windows + tags = [ + "no_mac", + "no_windows", # TODO: needs investigation on Windows + ], deps = [ ":state_space_model", "//tensorflow/contrib/layers:layers_py", diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index a0a9cb3f31a945a00eb3f6a5fd1402aab9a2df5f..8264462a06ae8d50ee2b83ccea4ce1fb35be12b2 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -78,6 +78,7 @@ py_library( "//tensorflow/python:init_ops", "//tensorflow/python:math_ops", "//tensorflow/python:platform", + "//tensorflow/python:session", "//tensorflow/python:state_ops", "//tensorflow/python:summary", "//tensorflow/python:summary_ops_v2", diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py index 08f58a5f5b89f92502893e222cbca3bd07b2432b..73753cd9181403d97b18f117a17e3e75e1f3b974 100644 --- a/tensorflow/contrib/tpu/python/tpu/keras_support.py +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -1012,9 +1012,10 @@ class TPUFunction(object): optimizer=_replicated_optimizer(self._cloned_optimizer), loss=self.model.loss, loss_weights=self.model.loss_weights, - metrics=metrics_module.clone_metrics(self.model.metrics), + metrics=metrics_module.clone_metrics( + self.model._compile_metrics), weighted_metrics=metrics_module.clone_metrics( - self.model.weighted_metrics), + self.model._compile_weighted_metrics), target_tensors=tpu_targets, ) @@ -1184,12 +1185,9 @@ class TPUFunction(object): # pipelined loop. return None, None - if not isinstance(K.learning_phase(), int): + if isinstance(inputs[-1], int): # Remove the learning_phase flag at the end. We currently hard code the # learning_phase in TPUFunction. - assert isinstance(inputs[-1], int), ( - 'Expect the final element be learning_phase flag. Got {}'.format( - inputs[-1])) inputs = inputs[:-1] if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or @@ -1379,6 +1377,7 @@ class KerasTPUModel(models.Model): self.train_function = None self._fit_function = None self._eval_function = None + self._stateful_metric_functions = [] cluster_resolver = strategy._tpu_cluster_resolver self._tpu_name_or_address = cluster_resolver.get_master() @@ -1393,10 +1392,10 @@ class KerasTPUModel(models.Model): self.compile( self._cpu_model.optimizer, self._cpu_model.loss, - self._cpu_model.metrics, + self._cpu_model._compile_metrics, self._cpu_model.loss_weights, self._cpu_model.sample_weight_mode, - self._cpu_model.weighted_metrics, + self._cpu_model._compile_weighted_metrics, self._cpu_model.target_tensors, ) @@ -1700,7 +1699,7 @@ class KerasTPUModel(models.Model): callbacks.on_train_begin() for epoch in range(initial_epoch, epochs): # Reset stateful metrics - for m in self.stateful_metric_functions: + for m in self.metrics: m.reset_states() # Update callbacks callbacks.on_epoch_begin(epoch) @@ -1998,14 +1997,14 @@ class KerasTPUModel(models.Model): self._optimizer = optimizer @property - def stateful_metric_functions(self): + def metrics(self): if self._tpu_model: - return self._tpu_model.stateful_metric_functions + return self._tpu_model.metrics return self._stateful_metric_functions - @stateful_metric_functions.setter - def stateful_metric_functions(self, stateful_metric_functions): - self._stateful_metric_functions = stateful_metric_functions + @metrics.setter + def metrics(self, metrics): + self._stateful_metric_functions = metrics def _make_train_function(self): if not self.train_function: @@ -2230,10 +2229,10 @@ def tpu_model(model, strategy=None): cpu_model.compile( _clone_optimizer(model.optimizer, optimizer_config), model.loss, - metrics_module.clone_metrics(model.metrics), + metrics_module.clone_metrics(model._compile_metrics), model.loss_weights, model.sample_weight_mode, - metrics_module.clone_metrics(model.weighted_metrics), + metrics_module.clone_metrics(model._compile_weighted_metrics), ) if model_weights: diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index e3e791faacb9b3c1fedbd83d3740e35351e38abb..def57da20d6018dcf27ccb7a9d04592f38ce2f7c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -1001,8 +1001,8 @@ def rewrite(computation, `rewrite` is a list of tensors corresponding to the tensors from the output of `computation`. - All `Operation`s returned from `computation` will be executed when - evaluating any of the returned output tensors. + All `Operation`s constructed during `computation` will be executed when + evaluating any of the returned output tensors, not just the ones returned. inputs: A list of input tensors or `None` (equivalent to an empty list). infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple of arguments as inputs to `computation`. @@ -1111,7 +1111,7 @@ def validate_inference_rewrite_for_variables(graph): Raises: RuntimeError: if validation failed. """ - if not any([x.type == "GuaranteeConst" for x in graph.get_operations()]): + if not any(x.type == "GuaranteeConst" for x in graph.get_operations()): raise RuntimeError( "No GuaranteeConst ops found in the graph after running " "tpu.rewrite_for_inference(...). Please check that you are using " diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py index da6bdf67d686fba09d66386de982b57aa28d4dd4..672462447944b777375331d49727c4d5366cf295 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py @@ -41,7 +41,7 @@ _NUM_CORES_TO_COMPUTATION_SHAPE = { class TPUContext(object): - """The context of current input_fn invocation.""" + """A context that holds the current configuration of the TPU computation.""" def __init__(self, internal_ctx, diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 932367f4dd546c7867ea75eba1ae36813c9080da..9525121ebb0e52b59dd6446a33582199294227a6 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -45,6 +45,7 @@ from tensorflow.contrib.training.python.training import hparam from tensorflow.core.framework import variable_pb2 from tensorflow.core.framework.summary_pb2 import Summary from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session as tf_session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest as data_nest from tensorflow.python.estimator import estimator as estimator_lib @@ -412,12 +413,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=True, - rendezvous=None): + rendezvous=None, + master=None, + session_config=None): self._master_job = ctx.master_job self._enqueue_ops = enqueue_ops self._dequeue_ops = dequeue_ops self._rendezvous = rendezvous - + self._master = master + self._session_config = session_config self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator self._initial_infeed_sleep_secs = ( ctx.config.tpu_config.initial_infeed_sleep_secs) @@ -429,11 +433,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): def begin(self): logging.info('TPU job name %s', self._master_job) self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + self._init_ops = [] if self._should_initialize_tpu: - self._init_ops = [tpu.initialize_system(job=self._master_job)] self._finalize_ops = [tpu.shutdown_system(job=self._master_job)] else: - self._init_ops = [] self._finalize_ops = [] summary_writer_init_ops = contrib_summary.summary_writer_initializer_op() @@ -475,11 +478,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): return _OpQueueContext(name=name, target=target, args=args) def after_create_session(self, session, coord): - logging.info('Init TPU system') - start = time.time() + if self._should_initialize_tpu: + logging.info('Init TPU system') + start = time.time() + with ops.Graph().as_default(): + with tf_session.Session( + self._master, config=self._session_config) as sess: + sess.run(tpu.initialize_system(job=self._master_job)) + logging.info('Initialized TPU in %d seconds', time.time() - start) + session.run(self._init_ops, options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) - logging.info('Initialized TPU in %d seconds', time.time() - start) self._infeed_controller = self._create_infeed_controller( name='InfeedController', target=self._run_infeed, args=(session,)) @@ -2564,6 +2573,8 @@ class TPUEstimator(estimator_lib.Estimator): run_infeed_loop_on_coordinator=( run_infeed_loop_on_coordinator), rendezvous=self._rendezvous[mode], + master=self._config.master, + session_config=self._session_config, ), InstallSignalHandlerHook() ]) @@ -2666,8 +2677,10 @@ class TPUEstimator(estimator_lib.Estimator): eval_update_ops + host_ops, run_infeed_loop_on_coordinator=( run_infeed_loop_on_coordinator), - rendezvous=self._rendezvous[mode]), - ] + input_hooks + rendezvous=self._rendezvous[mode], + master=self._config.master, + session_config=self._session_config, + )] + input_hooks if eval_hooks: hooks.extend(eval_hooks) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py index e75a09492ec12b95bad32b221a8e78a1b79f3a6b..cf36103277de2e3b055ae89c66b198fb55bb4522 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py @@ -26,7 +26,6 @@ import numpy as np from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding -from tensorflow.compiler.xla.python_api import xla_shape from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.contrib.tpu.python.tpu import tpu from tensorflow.contrib.tpu.python.tpu import tpu_sharding @@ -92,8 +91,7 @@ class InfeedQueue(object): else: raise ValueError( "number of tuple elements cannot be inferred from InfeedQueue " - "constructor" - ) + "constructor") if number_of_tuple_elements <= 0: raise ValueError("number_of_tuple_elements %d must be > 0" % number_of_tuple_elements) @@ -293,9 +291,8 @@ class InfeedQueue(object): self.number_of_tuple_elements """ if len(input_tensors) != self.number_of_tuple_elements: - raise ValueError( - "input_tensors is %s, but should be a list of %d Tensors", ( - str(input_tensors), self.number_of_tuple_elements)) + raise ValueError("input_tensors is %s, but should be a list of %d Tensors" + % (str(input_tensors), self.number_of_tuple_elements)) self.set_tuple_shapes([t.shape for t in input_tensors]) self.set_tuple_types([t.dtype for t in input_tensors]) @@ -451,8 +448,8 @@ class InfeedQueue(object): for i in xrange(1, self.number_of_tuple_elements): if devices[0] != devices[i]: raise ValueError( - "input devices for shard %d are %s, but should all be the same", - index, str(devices)) + "input devices for shard %d are %s, but should all be the same" % + (index, str(devices))) with ops.colocate_with(inputs[0]): return tpu_ops.infeed_enqueue_tuple( inputs=inputs, @@ -792,18 +789,14 @@ class _PartitionedInfeedQueue(InfeedQueue): Args: tensor: Input tensor for partitioning. - dims: A list of integer describes how to partition the input tensor. + dims: 1-D np.array of the list of integer describes how to partition the + input tensor. Raises: ValueError: If the tensor can't be partitioned by dims or the num_cores_per_replica doesn't match the number of partitions(dims.prod()). """ - if dims is None: - return - - dims = np.array(dims) - if (dims < 1).any(): raise ValueError("All input partition dims must be >= 1.") @@ -823,11 +816,6 @@ class _PartitionedInfeedQueue(InfeedQueue): "partition dims = {}).".format(tensor.shape.as_list(), dims)) tensor.shape.assert_is_fully_defined() - if (np.array(tensor.shape.as_list()) % dims != 0).any(): - raise ValueError( - "All input partition dims must divide exactly into the `Tensor` " - "shape (tensor shape = {}, input partition dims = {}).".format( - tensor.shape.as_list(), dims)) def _partition_or_replicate_on_host(self, tensor, dims): """Partitions or replicates the input tensor. @@ -840,16 +828,33 @@ class _PartitionedInfeedQueue(InfeedQueue): Returns: An iterator of `Tensor`s or a list of partioned tensors. """ - self._check_input_partition_dims(tensor, dims) if dims is None: return itertools.repeat(tensor) - else: - output = [tensor] - for axis, dim in enumerate(dims): - if dim > 1: - output = [array_ops.split(x, dim, axis=axis) for x in output] - output = nest.flatten(output) - return output + dims = np.array(dims) + self._check_input_partition_dims(tensor, dims) + output = [tensor] + divds, remainders = np.divmod(np.array(tensor.shape.as_list()), dims) + for axis, (divd, remainder, dim) in enumerate( + np.dstack((divds, remainders, dims))[0]): + if dim <= 1: + continue + if remainder > 0: + # For each dimension, when it cannot be evenly partitioned, XLA assumes + # the size of last parts are smaller by 1. E.g. 2D tensor with shape + # (5, 14) and dims are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] + # => [[(3, 3), (3, 3), (2, 3), (2, 3)], + # [(2, 3), (2, 3), (2, 2), (2, 2)]] + output = [ + array_ops.split( + x, + num_or_size_splits=[divd + 1] * remainder + + [divd] * (dim - remainder), + axis=axis) for x in output + ] + else: + output = [array_ops.split(x, dim, axis=axis) for x in output] + output = nest.flatten(output) + return output def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims): """Tags appropriate XLA sharding attribute to the dequeued tensor. @@ -866,13 +871,9 @@ class _PartitionedInfeedQueue(InfeedQueue): elif np.prod(dims) == 1: return xla_sharding.assign_device(tensor, 0) else: - tile_shape = np.array(tensor.shape.as_list()) // dims tile_assignment = np.arange(np.prod(dims)).reshape(dims) return xla_sharding.tile( tensor=tensor, - tile_shape=xla_shape.CreateShapeFromDtypeAndTuple( - dtype=np.dtype(tensor.dtype.as_numpy_dtype), - shape_tuple=tile_shape), tile_assignment=tile_assignment) def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims): diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 52d3f8f2b98c1ed3aba525f817f58471a0c9bb2b..1b2d944f295b7abdd473f9d7b0468cb539470990 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1052,6 +1052,7 @@ tf_gen_op_libs( "batch_ops", "bitwise_ops", "boosted_trees_ops", + "tensor_forest_ops", "candidate_sampling_ops", "checkpoint_ops", "collective_ops", @@ -1201,6 +1202,7 @@ cc_library( ":batch_ops_op_lib", ":bitwise_ops_op_lib", ":boosted_trees_ops_op_lib", + ":tensor_forest_ops_op_lib", ":candidate_sampling_ops_op_lib", ":checkpoint_ops_op_lib", ":collective_ops_op_lib", @@ -1349,63 +1351,63 @@ cc_library( name = "all_kernels_statically_linked", visibility = ["//visibility:private"], deps = [ - "//tensorflow/core/kernels:array", - "//tensorflow/core/kernels:audio", - "//tensorflow/core/kernels:batch_kernels", - "//tensorflow/core/kernels:bincount_op", - "//tensorflow/core/kernels:boosted_trees_ops", - "//tensorflow/core/kernels:candidate_sampler_ops", - "//tensorflow/core/kernels:checkpoint_ops", - "//tensorflow/core/kernels:collective_ops", - "//tensorflow/core/kernels:control_flow_ops", - "//tensorflow/core/kernels:ctc_ops", - "//tensorflow/core/kernels:cudnn_rnn_kernels", - "//tensorflow/core/kernels:data_flow", - "//tensorflow/core/kernels:dataset_ops", - "//tensorflow/core/kernels:decode_proto_op", - "//tensorflow/core/kernels:encode_proto_op", - "//tensorflow/core/kernels:fake_quant_ops", - "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:functional_ops", - "//tensorflow/core/kernels:grappler", - "//tensorflow/core/kernels:histogram_op", - "//tensorflow/core/kernels:image", - "//tensorflow/core/kernels:io", - "//tensorflow/core/kernels:linalg", - "//tensorflow/core/kernels:list_kernels", - "//tensorflow/core/kernels:lookup", - "//tensorflow/core/kernels:logging", - "//tensorflow/core/kernels:manip", - "//tensorflow/core/kernels:math", - "//tensorflow/core/kernels:multinomial_op", - "//tensorflow/core/kernels:nn", - "//tensorflow/core/kernels:parameterized_truncated_normal_op", - "//tensorflow/core/kernels:parsing", - "//tensorflow/core/kernels:partitioned_function_ops", - "//tensorflow/core/kernels:ragged_ops", - "//tensorflow/core/kernels:random_ops", - "//tensorflow/core/kernels:random_poisson_op", - "//tensorflow/core/kernels:remote_fused_graph_ops", - "//tensorflow/core/kernels:required", - "//tensorflow/core/kernels:resource_variable_ops", - "//tensorflow/core/kernels:rpc_op", - "//tensorflow/core/kernels:scoped_allocator_ops", - "//tensorflow/core/kernels:sdca_ops", - "//tensorflow/core/kernels:searchsorted_op", - "//tensorflow/core/kernels:set_kernels", - "//tensorflow/core/kernels:sparse", - "//tensorflow/core/kernels:state", - "//tensorflow/core/kernels:stateless_random_ops", - "//tensorflow/core/kernels:string", - "//tensorflow/core/kernels:summary_kernels", - "//tensorflow/core/kernels:training_ops", - "//tensorflow/core/kernels:word2vec_kernels", - ] + tf_additional_cloud_kernel_deps() + - select({ - "//tensorflow:no_nccl_support": [], - "//tensorflow:windows": [], - "//conditions:default": ["//tensorflow/core/kernels:nccl_kernels"], - }) + if_not_windows([ + "//tensorflow/core/kernels:array", + "//tensorflow/core/kernels:audio", + "//tensorflow/core/kernels:batch_kernels", + "//tensorflow/core/kernels:bincount_op", + "//tensorflow/core/kernels:boosted_trees_ops", + "//tensorflow/core/kernels:tensor_forest_ops", + "//tensorflow/core/kernels:candidate_sampler_ops", + "//tensorflow/core/kernels:checkpoint_ops", + "//tensorflow/core/kernels:collective_ops", + "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/core/kernels:ctc_ops", + "//tensorflow/core/kernels:cudnn_rnn_kernels", + "//tensorflow/core/kernels:data_flow", + "//tensorflow/core/kernels:dataset_ops", + "//tensorflow/core/kernels:decode_proto_op", + "//tensorflow/core/kernels:encode_proto_op", + "//tensorflow/core/kernels:fake_quant_ops", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:functional_ops", + "//tensorflow/core/kernels:grappler", + "//tensorflow/core/kernels:histogram_op", + "//tensorflow/core/kernels:image", + "//tensorflow/core/kernels:io", + "//tensorflow/core/kernels:linalg", + "//tensorflow/core/kernels:list_kernels", + "//tensorflow/core/kernels:lookup", + "//tensorflow/core/kernels:logging", + "//tensorflow/core/kernels:manip", + "//tensorflow/core/kernels:math", + "//tensorflow/core/kernels:multinomial_op", + "//tensorflow/core/kernels:nn", + "//tensorflow/core/kernels:parameterized_truncated_normal_op", + "//tensorflow/core/kernels:parsing", + "//tensorflow/core/kernels:partitioned_function_ops", + "//tensorflow/core/kernels:ragged_ops", + "//tensorflow/core/kernels:random_ops", + "//tensorflow/core/kernels:random_poisson_op", + "//tensorflow/core/kernels:remote_fused_graph_ops", + "//tensorflow/core/kernels:required", + "//tensorflow/core/kernels:resource_variable_ops", + "//tensorflow/core/kernels:rpc_op", + "//tensorflow/core/kernels:scoped_allocator_ops", + "//tensorflow/core/kernels:sdca_ops", + "//tensorflow/core/kernels:searchsorted_op", + "//tensorflow/core/kernels:set_kernels", + "//tensorflow/core/kernels:sparse", + "//tensorflow/core/kernels:state", + "//tensorflow/core/kernels:stateless_random_ops", + "//tensorflow/core/kernels:string", + "//tensorflow/core/kernels:summary_kernels", + "//tensorflow/core/kernels:training_ops", + "//tensorflow/core/kernels:word2vec_kernels", + ] + tf_additional_cloud_kernel_deps() + select({ + "//tensorflow:no_nccl_support": [], + "//tensorflow:windows": [], + "//conditions:default": ["//tensorflow/core/kernels:nccl_kernels"], + }) + if_not_windows([ "//tensorflow/core/kernels:fact_op", "//tensorflow/core/kernels:array_not_windows", "//tensorflow/core/kernels:math_not_windows", diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc index 6f9885691595368ab50cfe660b1b5c75673063cf..d38a8424eb13009fbf84d7511fb1325085d8b809 100644 --- a/tensorflow/core/api_def/api_test.cc +++ b/tensorflow/core/api_def/api_test.cc @@ -182,11 +182,14 @@ void TestDeprecationVersionSetCorrectly( for (const auto& name_and_api_def : api_defs_map) { const auto& name = name_and_api_def.first; const auto& api_def = name_and_api_def.second; - ASSERT_TRUE(api_def.deprecation_version() == 0 || - api_def.deprecation_message().empty()) - << "ApiDef that includes deprecation_version > 0 must also specify " - << "a deprecation_message. Op " << name - << " has deprecation_version > 0 but deprecation_message is not set."; + if (api_def.deprecation_version() != 0) { + ASSERT_TRUE(api_def.deprecation_version() > 0) + << "Found ApiDef with negative deprecation_version"; + ASSERT_FALSE(api_def.deprecation_message().empty()) + << "ApiDef that includes deprecation_version > 0 must also specify " + << "a deprecation_message. Op " << name + << " has deprecation_version > 0 but deprecation_message is not set."; + } } } } // namespace diff --git a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt index 4e48d6c169b6641ece5f11d5add478ce25611ee8..0ba2327371a4ba0f5f553815fc9e8c991f62b424 100644 --- a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt @@ -3,13 +3,13 @@ op { in_arg { name: "input" description: <