diff --git a/.gitignore b/.gitignore index be75938ec401b1d72fa54773c85191aaac7d7f35..828bbe9bd3363853ae3f58f54a8d5f60cefad837 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ Podfile.lock /tensorflow/contrib/lite/examples/ios/simple/data/*.txt /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite xcuserdata/** +/api_init_files_list.txt # Android .gradle diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index ca80db23ed3ccbbdc49c61db6cd03ff735470512..9b86425aa5fbc2be2872b3f5d2809eaa844f9d68 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) { TestGradientsError(false); } -// REGISTER_OP for CApiTestAttributesTest test cases. +// REGISTER_OP for CApiAttributesTest test cases. // Registers two ops, each with a single attribute called 'v'. // The attribute in one op will have a type 'type', the other // will have list(type). diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index a2d96357ac8a55be7fe03bf58e33ff1733967dd1..14321191625e448637aa44a7f6a17820159b97c2 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -31,7 +31,6 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", "//tensorflow/core/common_runtime/eager:execute", - "//tensorflow/core/common_runtime/eager:execute_node", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core/common_runtime/eager:copy_to_device_node", @@ -49,6 +48,7 @@ tf_cuda_library( ], "//conditions:default": [], }) + [ + "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core:gpu_runtime", ], ) @@ -71,6 +71,7 @@ tf_cuda_library( "//tensorflow/core:lib_internal", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", + "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 393851d13c9b1ad0184f2778fa11afb271bd241b..3bf071f3abaac7dfd4113964fd49cd9322913bd5 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h" #include "tensorflow/core/common_runtime/eager/execute.h" -#include "tensorflow/core/common_runtime/eager/execute_node.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/node_def_util.h" @@ -218,9 +217,6 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { } return retval; } -} // extern "C" - -extern "C" { TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { @@ -240,21 +236,18 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, void TFE_DeleteOp(TFE_Op* op) { delete op; } void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { - tensorflow::Device* d = nullptr; - if (device_name != nullptr && strlen(device_name) > 0) { - status->status = op->ctx->context.FindDeviceByName(device_name, &d); - } - op->device = d; + status->status = op->operation.SetDevice(device_name); } const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { - tensorflow::Device* device = - (op->device == nullptr) ? op->ctx->context.HostCPU() : op->device; + tensorflow::Device* device = (op->operation.Device() == nullptr) + ? op->operation.EagerContext()->HostCPU() + : op->operation.Device(); return device->name().c_str(); } void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { - op->use_xla = enable; + op->operation.SetUseXla(enable); #ifndef TENSORFLOW_EAGER_USE_XLA LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not " "built with XLA support."; @@ -262,22 +255,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - h->handle->Ref(); - op->inputs.push_back(h->handle); - op->attrs.NumInputs(op->inputs.size()); + op->operation.AddInput(h->handle); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret; - if (op->is_function()) { + if (op->operation.is_function()) { status->status = tensorflow::errors::Unimplemented( "TODO(apassos): Support for attributes for TensorFlow functions is not " "ready yet."); return TF_ATTR_INT; // The compiler requires that we return something. } - status->status = - tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list); + status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(), + attr_name, &ret, is_list); return ret; } @@ -296,23 +287,24 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, } void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) { - op->attrs.Set(attr_name, value); + op->operation.MutableAttrs()->Set(attr_name, value); } void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) { - op->attrs.Set(attr_name, static_cast(value)); + op->operation.MutableAttrs()->Set(attr_name, static_cast(value)); } void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) { - op->attrs.Set(attr_name, value); + op->operation.MutableAttrs()->Set(attr_name, value); } void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) { - op->attrs.Set(attr_name, (value == 0) ? false : true); + op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true); } void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) { - op->attrs.Set(attr_name, static_cast(value)); + op->operation.MutableAttrs()->Set(attr_name, + static_cast(value)); } void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, @@ -334,23 +326,24 @@ void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, proto.add_dim()->set_size(dims[d]); } } - op->attrs.Set(attr_name, proto); + op->operation.MutableAttrs()->Set(attr_name, proto); } void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, const TFE_Op* value) { tensorflow::AttrValue attr_value; tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(value->name); - value->attrs.FillAttrValueMap(func->mutable_attr()); - op->attrs.Set(attr_name, attr_value); + func->set_name(value->operation.Name()); + value->operation.Attrs().FillAttrValueMap(func->mutable_attr()); + op->operation.MutableAttrs()->Set(attr_name, attr_value); } #define TFE_OP_SET_ATTR_LIST(fn, type) \ void fn(TFE_Op* op, const char* attr_name, const type* values, \ int num_values) { \ - op->attrs.Set(attr_name, tensorflow::gtl::ArraySlice( \ - values, num_values)); \ + op->operation.MutableAttrs()->Set( \ + attr_name, \ + tensorflow::gtl::ArraySlice(values, num_values)); \ } TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*) TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) @@ -358,14 +351,14 @@ TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, const int64_t* values, int num_values) { - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + reinterpret_cast(values), num_values)); } void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, const TF_DataType* values, int num_values) { - op->attrs.Set( + op->operation.MutableAttrs()->Set( attr_name, tensorflow::gtl::ArraySlice( reinterpret_cast(values), num_values)); @@ -377,8 +370,8 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, for (int i = 0; i < num_values; ++i) { b[i] = values[i]; } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice(b.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice(b.get(), num_values)); } void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, @@ -408,9 +401,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, } } } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - proto.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + proto.get(), num_values)); } void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, @@ -418,531 +411,25 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, std::unique_ptr funcs( new tensorflow::NameAttrList[num_values]); for (int i = 0; i < num_values; i++) { - funcs[i].set_name(value[i]->name); - value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr()); - } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - funcs.get(), num_values)); -} -} // extern "C" - -namespace { - -// Initializes the step stats if needed. -void MaybeInitializeStepStats(tensorflow::StepStats* step_stats, - tensorflow::EagerContext* ctx) { - // Lazily initialize the RunMetadata with information about all devices if - // this is the first call. - while (step_stats->dev_stats_size() < ctx->devices()->size()) { - int device_idx = step_stats->dev_stats_size(); - auto* dev_stats = step_stats->add_dev_stats(); - dev_stats->set_device(ctx->devices()->at(device_idx)->name()); - } -} - -int StepStatsDeviceIndex(tensorflow::StepStats* step_stats, - tensorflow::EagerContext* ctx, - tensorflow::Device* device) { - // Find the current device's index. - if (device == nullptr) { - device = ctx->HostCPU(); - } - for (int i = 0; i < ctx->devices()->size(); ++i) { - if (ctx->devices()->at(i) == device || - ctx->devices()->at(i)->name() == device->name()) { - return i; - } - } - // TODO(apassos) do not fall back to host CPU if device is unknown. - return 0; -} - -tensorflow::Status ValidateInputTypeAndPlacement( - tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op, - const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) { - tensorflow::Device* host_device = ctx->HostCPU(); - const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types(); - if (memtypes.size() != op->inputs.size()) { - return tensorflow::errors::InvalidArgument( - "expected ", memtypes.size(), " inputs, got ", op->inputs.size()); - } - for (int i = 0; i < op->inputs.size(); ++i) { - const tensorflow::Device* expected_device = - memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device; - tensorflow::TensorHandle* handle = op->inputs[i]; - tensorflow::Device* handle_device = nullptr; - TF_RETURN_IF_ERROR(handle->Device(&handle_device)); - const tensorflow::Device* actual_device = - handle_device == nullptr ? host_device : handle_device; - if (expected_device != actual_device) { - switch (ctx->GetDevicePlacementPolicy()) { - case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32: - // TODO(xpan): See if we could bubble python related error up - // to python level. - if (handle->dtype == tensorflow::DT_INT32) { - // Note: enabling silent copies of int32 tensors to match behavior - // of graph mode. - break; - } - TF_FALLTHROUGH_INTENDED; - case tensorflow::DEVICE_PLACEMENT_EXPLICIT: - return tensorflow::errors::InvalidArgument( - "Tensors on conflicting devices:" - " cannot compute ", - op->name, " as input #", i, " was expected to be on ", - expected_device->name(), " but is actually on ", - actual_device->name(), " (operation running on ", - op_device->name(), ")", - " Tensors can be copied explicitly using .gpu() or .cpu() " - "methods," - " or transparently copied by using tf.enable_eager_execution(" - "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors " - "between devices" - " may slow down your model"); - case tensorflow::DEVICE_PLACEMENT_WARN: - LOG(WARNING) << "before computing " << op->name << " input #" << i - << " was expected to be on " << expected_device->name() - << " but is actually on " << actual_device->name() - << " (operation running on " << op_device->name() - << "). This triggers a copy which can be a performance " - "bottleneck."; - break; - case tensorflow::DEVICE_PLACEMENT_SILENT: // Do nothing. - break; - } - // We are only here if the policy is warn or silent copies, so we should - // trigger a copy. - auto pre_time = tensorflow::Env::Default()->NowMicros(); - tensorflow::TensorHandle* copied_tensor = nullptr; - tensorflow::Status status = tensorflow::EagerCopyToDevice( - handle, ctx, expected_device->name().c_str(), &copied_tensor); - if (run_metadata != nullptr) { - auto* step_stats = run_metadata->mutable_step_stats(); - MaybeInitializeStepStats(step_stats, ctx); - // Record the sending on the source device for now. - int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device); - auto* dev_stats = step_stats->mutable_dev_stats(device_idx); - auto* node_stats = dev_stats->add_node_stats(); - node_stats->set_node_name("_Send"); - node_stats->set_all_start_micros(pre_time); - node_stats->set_op_end_rel_micros( - tensorflow::Env::Default()->NowMicros() - pre_time); - } - if (!status.ok()) { - if (copied_tensor != nullptr) copied_tensor->Unref(); - return tensorflow::errors::Internal( - "Failed copying input tensor from ", actual_device->name(), " to ", - expected_device->name(), " in order to run ", op->name, ": ", - status.error_message()); - } - handle->Unref(); - handle = copied_tensor; - op->inputs[i] = copied_tensor; - } - if (handle->dtype != kernel->input_type(i)) { - return tensorflow::errors::InvalidArgument( - "cannot compute ", op->name, " as input #", i, - " was expected to be a ", - tensorflow::DataTypeString(kernel->input_type(i)), - " tensor but is a ", tensorflow::DataTypeString(handle->dtype), - " tensor"); - } - } - return tensorflow::Status::OK(); -} - -tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef, - TFE_Context* ctx, TF_Status* status) { - tensorflow::DeviceSet ds; - for (tensorflow::Device* d : *ctx->context.devices()) { - ds.AddDevice(d); - } - tensorflow::DeviceTypeVector final_devices; - status->status = tensorflow::SupportedDeviceTypesForNode( - ds.PrioritizedDeviceTypeList(), ndef, &final_devices); - if (!status->status.ok()) { - return nullptr; - } - if (final_devices.empty()) { - status->status = tensorflow::errors::Internal( - "Could not find valid device for node ", ndef.DebugString()); - return nullptr; - } - for (tensorflow::Device* d : *ctx->context.devices()) { - if (d->device_type() == final_devices[0].type_string()) { - return d; - } - } - status->status = tensorflow::errors::Unknown( - "Could not find a device for node ", ndef.DebugString()); - return nullptr; -} - -#ifdef TENSORFLOW_EAGER_USE_XLA -// Synthesizes and returns a wrapper function over `op`, which must be a -// primitive op (e.g. matmul). -// -// The wrapper function conforms to the function signature expected by -// _XlaLaunchOp, with input params ordered by . For example, if the op has input params , they will be reordered to as the input params to the synthesized function. -// -// It populates `const_input_types`, `arg_input_types` and -// `op_input_to_func_input` based on the reordering results, that the caller can -// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets -// `status` accordingly. -const tensorflow::FunctionDef* OpToFunction( - TFE_Op* op, std::vector* const_input_types, - std::vector* arg_input_types, - tensorflow::gtl::FlatMap* op_input_to_func_input, - TF_Status* status) { - DCHECK(!op->is_function()); - - tensorflow::FunctionDef fdef; - - // Get the OpDef of the op we are trying to encapsulate. - TFE_Context* ctx = op->ctx; - const tensorflow::OpRegistrationData* op_data; - { - status->status = ctx->context.FindFunctionOpData(op->name, &op_data); - if (!status->status.ok()) { - return nullptr; - } - } - const tensorflow::OpDef& op_def = op_data->op_def; - - tensorflow::OpDef* signature = fdef.mutable_signature(); - - // Handle constant inputs. - const std::unordered_set const_inputs( - *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name)); - - // First add place holders for the input args, so that we can refer to them by - // position in the next loop. Also tally up the resource inputs. - int num_resource_inputs = 0; - for (int i = 0; i < op_def.input_arg_size(); ++i) { - if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) { - ++num_resource_inputs; - } - signature->add_input_arg(); - } - - // Now we map the input params from `op_def` to `signature`, where the param - // ordering for `signature` is: . - int const_index = 0; - int arg_index = const_inputs.size(); - int resource_index = op_def.input_arg_size() - num_resource_inputs; - for (int i = 0; i < op_def.input_arg_size(); ++i) { - const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i); - tensorflow::OpDef::ArgDef* func_input_arg = nullptr; - if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) { - VLOG(1) << "For const input, mapping op input " << i << " to func input " - << const_index; - (*op_input_to_func_input)[i] = const_index; - func_input_arg = signature->mutable_input_arg(const_index++); - const_input_types->push_back( - static_cast(op->inputs[i]->dtype)); - } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) { - VLOG(1) << "For resource input, mapping op input " << i - << " to func input " << resource_index; - (*op_input_to_func_input)[i] = resource_index; - func_input_arg = signature->mutable_input_arg(resource_index++); - } else { - VLOG(1) << "For arg input, mapping op input " << i << " to func input " - << arg_index; - (*op_input_to_func_input)[i] = arg_index; - func_input_arg = signature->mutable_input_arg(arg_index++); - arg_input_types->push_back( - static_cast(op->inputs[i]->dtype)); - } - - func_input_arg->set_name(op_input_arg.name()); - func_input_arg->set_type(op->inputs[i]->dtype); - } - VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString(); - - // Resources args are at the end of the function input params, and we should - // have iterated over all of them. - DCHECK_EQ(signature->input_arg_size(), resource_index); - - // Make the synthesized function's name unique. - signature->set_name(tensorflow::strings::StrCat( - op_def.name(), func_id_generator.fetch_add(1))); - - // Add the node def and set its input names to match op_def's names. - const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef(); - DCHECK_EQ(signature->input_arg_size(), ndef.input_size()); - *fdef.add_node_def() = ndef; - for (int i = 0; i < op_def.input_arg_size(); ++i) { - fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name()); - } - VLOG(1) << "Added NodeDef: " << fdef.DebugString(); - - // Fix the output names and set output types. - for (int i = 0; i < op_def.output_arg_size(); ++i) { - tensorflow::OpDef::ArgDef* arg = signature->add_output_arg(); - const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i); - const string& out_tensor_name = tensorflow::strings::StrCat( - ndef.name(), ":", op_def_arg.name(), ":", 0); - arg->set_name(op_def_arg.name()); - (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name; - const string& type_attr = op_def_arg.type_attr(); - if (!type_attr.empty()) { - auto i = ndef.attr().find(type_attr); - if (i == ndef.attr().end()) { - status->status = tensorflow::errors::InvalidArgument( - tensorflow::strings::StrCat("Could not find attr ", type_attr, - " in NodeDef ", ndef.DebugString())); - return nullptr; - } - arg->set_type(i->second.type()); - } - } - VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString(); - - status->status = ctx->context.AddFunctionDef(fdef); - if (!status->status.ok()) return nullptr; - const auto ret = ctx->context.FindFunctionDef(signature->name()); - DCHECK(ret != nullptr); - return ret; -} - -// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed -// via XLA. -std::unique_ptr BuildXlaLaunch(TFE_Op* op, TF_Status* status) { - VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name; - auto launch_op = - std::unique_ptr(TFE_NewOp(op->ctx, "_XlaLaunch", status)); - if (TF_GetCode(status) != TF_OK) return nullptr; - if (op->device) { - TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status); - if (TF_GetCode(status) != TF_OK) return nullptr; - } - - const tensorflow::FunctionDef* fdef; - { fdef = op->ctx->context.FindFunctionDef(op->name); } - std::vector const_input_types; - std::vector arg_input_types; - tensorflow::gtl::FlatMap op_input_to_func_input; - if (fdef == nullptr) { - // See if this is a primitive op, and if so create a function for it, so - // that _XlaLaunchOp can access it. - fdef = OpToFunction(op, &const_input_types, &arg_input_types, - &op_input_to_func_input, status); - if (!status->status.ok()) return nullptr; - } else { - // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for - // functions, so we need to find another way to handle constant inputs. - for (int i = const_input_types.size(); - i < fdef->signature().input_arg_size(); ++i) { - VLOG(1) << "Adding Targs from input arg " << i; - const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i); - arg_input_types.push_back(static_cast(arg.type())); - } - } - DCHECK(fdef != nullptr); - - // Copy inputs and their devices. - // Since input param reordering may have occurred between `op` and `launch_op` - // via `op_input_to_func_input`, adjust the actual inputs accordingly. - launch_op->inputs = op->inputs; - for (tensorflow::TensorHandle* h : launch_op->inputs) { - h->Ref(); - } - if (!op_input_to_func_input.empty()) { - DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size()); - for (int i = 0; i < op_input_to_func_input.size(); ++i) { - VLOG(1) << "mapping op input " << i << " to func input " - << op_input_to_func_input[i]; - - launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i]; - } - } - launch_op->attrs.NumInputs(op->inputs.size()); - - TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(), - const_input_types.size()); - - // Set Targs and Nresources attrs. - TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(), - arg_input_types.size()); - const int num_resource_inputs = fdef->signature().input_arg_size() - - const_input_types.size() - - arg_input_types.size(); - TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs); - - // Set Tresults attr. - std::vector tresults; - for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) { - tresults.push_back(static_cast(arg.type())); + funcs[i].set_name(value[i]->operation.Name()); + value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr()); } - TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(), - tresults.size()); - - // Set function attr. - tensorflow::AttrValue attr_value; - tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(fdef->signature().name()); - launch_op->attrs.Set("function", attr_value); - - return launch_op; + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + funcs.get(), num_values)); } -#endif // TENSORFLOW_EAGER_USE_XLA - -} // namespace - -extern "C" { void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { - TFE_Context* ctx = op->ctx; - status->status = ctx->context.GetStatus(); + tensorflow::gtl::InlinedVector handle_retvals( + *num_retvals); + status->status = + tensorflow::EagerExecute(&op->operation, &handle_retvals, num_retvals); if (!status->status.ok()) { return; } -#ifdef TENSORFLOW_EAGER_USE_XLA - std::unique_ptr xla_launch_op; - if (op->use_xla && op->name != "_XlaLaunch") { - xla_launch_op = BuildXlaLaunch(op, status); - if (!status->status.ok()) { - return; - } - op = xla_launch_op.get(); - } -#endif // TENSORFLOW_EAGER_USE_XLA - // Ensure all resource-touching ops run in the device the resource is, - // regardless of anything else that has been specified. This is identical to - // the graph mode behavior. - for (int i = 0; i < op->inputs.size(); ++i) { - tensorflow::Device* input_op_device = nullptr; - status->status = op->inputs[i]->OpDevice(&input_op_device); - if (!status->status.ok()) return; - VLOG(2) << "for op " << op->name << " input " << i << " " - << tensorflow::DataTypeString(op->inputs[i]->dtype) << " " - << (input_op_device == nullptr ? "cpu" : input_op_device->name()) - << " " << (op->device == nullptr ? "cpu" : op->device->name()); - if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE && - (input_op_device != op->device || input_op_device == nullptr)) { - tensorflow::Device* d = - input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device; - VLOG(1) << "Changing device of operation " << op->name << " to " - << d->name() << " because input #" << i - << " is a resource in this device."; - op->device = d; - } - } - tensorflow::Device* device = op->device; - - tensorflow::Fprint128 cache_key = - op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name()); - tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key); - if (kernel == nullptr) { - const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef(); - if (device == nullptr) { - device = SelectDevice(ndef, ctx, status); - if (!status->status.ok()) { - return; - } - } - CHECK(device != nullptr); - if (ctx->context.LogDevicePlacement()) { - LOG(INFO) << "Executing op " << ndef.op() << " in device " - << device->name(); - } - kernel = new tensorflow::KernelAndDevice(ctx->context.GetRendezvous()); - // Knowledge of the implementation of Init (and in-turn - // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def - // will be accessed, so grab on to the lock. - // See WARNING comment in Execute (before kernel->Run) - would be nice to - // rework to avoid this subtlety. - tensorflow::tf_shared_lock l(*ctx->context.FunctionsMu()); - status->status = tensorflow::KernelAndDevice::Init( - ndef, ctx->context.func_lib(device), kernel); - if (!status->status.ok()) { - delete kernel; - return; - } - // Update output_dtypes inside `kernel`. - const tensorflow::OpDef* op_def = nullptr; - const tensorflow::FunctionDef* function_def = - ctx->context.FuncLibDef()->Find(ndef.op()); - if (function_def != nullptr) { - op_def = &(function_def->signature()); - } - if (op_def == nullptr) { - status->status = OpDefForOp(ndef.op().c_str(), &op_def); - if (!status->status.ok()) { - return; - } - } - tensorflow::DataTypeVector input_dtypes; - status->status = InOutTypesForNode(ndef, *op_def, &input_dtypes, - kernel->mutable_output_dtypes()); - if (!status->status.ok()) { - return; - } - ctx->context.AddKernelToCache(cache_key, kernel); - } - const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes(); - const int output_dtypes_size = output_dtypes.size(); - if (output_dtypes_size > *num_retvals) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat("Expecting ", output_dtypes.size(), - " outputs, but *num_retvals is ", - *num_retvals) - .c_str()); - return; - } - *num_retvals = output_dtypes_size; - if (device == nullptr) { - // TODO(apassos) debug how the assignment below might return a different - // device from the one requested above. - device = kernel->device(); - } - status->status = ValidateInputTypeAndPlacement( - &ctx->context, device, op, kernel->kernel(), - ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto() - : nullptr); - if (!status->status.ok()) return; - std::unique_ptr maybe_stats; - if (ctx->context.ShouldStoreMetadata()) { - maybe_stats.reset(new tensorflow::NodeExecStats); - maybe_stats->set_node_name(op->name); - maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros()); - maybe_stats->set_op_start_rel_micros(0); - maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros()); - // TODO(apassos) track referenced tensors - } - if (ctx->context.Async()) { - // Note that for async mode, execution order will make sure that all - // input handles are ready before executing them. - // TODO(agarwal): Consider executing "cheap" kernels inline for performance. - tensorflow::gtl::InlinedVector handle_retvals( - *num_retvals); - tensorflow::uint64 id = op->ctx->context.NextId(); - for (int i = 0; i < *num_retvals; ++i) { - tensorflow::TensorHandle* h = - new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context); - retvals[i] = new TFE_TensorHandle(h); - handle_retvals[i] = h; - } - tensorflow::EagerNode* node = new tensorflow::ExecuteNode( - id, &op->ctx->context, op->device, op->inputs, kernel, - maybe_stats.release(), output_dtypes, handle_retvals); - ctx->context.ExecutorAdd(node); - } else { - // Execute checks if retvals[i] is nullptr or not to figure if it needs to - // allocate it. - tensorflow::gtl::InlinedVector handle_retvals( - *num_retvals); - status->status = tensorflow::EagerExecute( - &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(), - handle_retvals.data(), *num_retvals); - for (int i = 0; i < *num_retvals; ++i) { - retvals[i] = new TFE_TensorHandle(handle_retvals[i]); - } + for (int i = 0; i < *num_retvals; ++i) { + retvals[i] = new TFE_TensorHandle(handle_retvals[i]); } } @@ -1085,9 +572,3 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, } } } // namespace tensorflow - -TFE_Op::~TFE_Op() { - for (tensorflow::TensorHandle* h : inputs) { - h->Unref(); - } -} diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 05dc64f521735f944559392f470a37590e93f17c..49e1aab1cef9577256d9b081858cf094c788caf8 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/eager/kernel_and_device.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/common_runtime/function.h" @@ -45,7 +46,6 @@ limitations under the License. #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/public/version.h" - struct TFE_ContextOptions { TF_SessionOptions session_options; // true if async execution is enabled. @@ -85,19 +85,9 @@ struct TFE_Op { // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a // primitive operation. TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t) - : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {} - - ~TFE_Op(); - - bool const is_function() const { return attr_types == nullptr; } + : operation(&ctx->context, op, t) {} - TFE_Context* ctx; // Must outlive the TFE_Op. - const tensorflow::string name; - tensorflow::AttrBuilder attrs; - const tensorflow::AttrTypeMap* attr_types; - tensorflow::gtl::InlinedVector inputs; - tensorflow::Device* device; - bool use_xla = false; + tensorflow::EagerOperation operation; }; namespace tensorflow { diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc index 93155998b86d59ec78c7ff25f146b8e3c8eac380..e18fdf6c57bd3f432d8cb73536fb816df90b3963 100644 --- a/tensorflow/c/python_api.cc +++ b/tensorflow/c/python_api.cc @@ -110,7 +110,7 @@ void ExtendSession(TF_Session* session, TF_Status* status) { session->extend_before_run = false; } -std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) { +std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) { Node* node = &output.oper->node; CppShapeInferenceResult::HandleData handle_data; handle_data.set_is_set(true); @@ -135,4 +135,30 @@ std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) { return result; } +void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, + const void* proto, size_t proto_len, + TF_Status* status) { + tensorflow::CppShapeInferenceResult::HandleData handle_data; + if (!handle_data.ParseFromArray(proto, proto_len)) { + status->status = tensorflow::errors::InvalidArgument( + "Couldn't deserialize HandleData proto"); + return; + } + DCHECK(handle_data.is_set()); + + tensorflow::mutex_lock l(graph->mu); + tensorflow::shape_inference::InferenceContext* ic = + graph->refiner.GetContext(&output.oper->node); + + std::vector shapes_and_types; + for (const auto& shape_and_type_proto : handle_data.shape_and_type()) { + tensorflow::shape_inference::ShapeHandle shape; + status->status = + ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape); + if (status->status.ok()) return; + shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype()); + } + ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); +} + } // namespace tensorflow diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h index 2d4c8cd9ed7bc926f448dab1f6b50ed74179ea14..4bcb5bde62c8a4df4e68c1ce0daaf459434ceb5d 100644 --- a/tensorflow/c/python_api.h +++ b/tensorflow/c/python_api.h @@ -55,9 +55,15 @@ void ExtendSession(TF_Session* session, TF_Status* status); // Returns the serialized CppShapeInferenceResult::HandleData proto for // `output` if its a resource tensor, or otherwise returns the empty string. -// TODO(b/74620627): remove when _USE_C_SHAPES is removed -std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output); - +std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output); + +// Sets `output` based on `proto`, which should be a serialized +// CppShapeInferenceResult::HandleData proto. +// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string +// because I couldn't get SWIG to work otherwise. +void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, + const void* proto, size_t proto_len, + TF_Status* status); } // namespace tensorflow #endif // TENSORFLOW_C_PYTHON_API_H_ diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index 6545e4ee3eb406436937a43ddac66d017af8e108..ff348fadb24e29a83bd6c8853aa67931f6df4182 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad); +Status StridedSliceGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + Input x = Shape(scope, op.input(0)); + Input begin = op.input(1); + Input end = op.input(2); + Input strides = op.input(3); + int64 begin_mask; + int64 end_mask; + int64 ellipsis_mask; + int64 new_axis_mask; + int64 shrink_axis_mask; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask)); + grad_outputs->push_back( + StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0], + StridedSliceGrad::BeginMask(begin_mask) + .EndMask(end_mask) + .EllipsisMask(ellipsis_mask) + .NewAxisMask(new_axis_mask) + .ShrinkAxisMask(shrink_axis_mask))); + // No gradients returned for begin, end and strides + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc index 4a215fcc9299cf8b8da04cbf151640631ed0d449..de3bd0fc9e2493f8ff76163f5be6bd4327c58c5a 100644 --- a/tensorflow/cc/gradients/array_grad_test.cc +++ b/tensorflow/cc/gradients/array_grad_test.cc @@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) { RunTest(x, x_shape, y, y_shape); } +TEST_F(ArrayGradTest, StridedSliceGrad) { + TensorShape x_shape({6, 4, 4}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + + // y = x[2:6:2, 1:3, 1:3] + auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1}); + // y.shape = [2, 2, 2]; + RunTest(x, x_shape, y, {2, 2, 2}); + + // y = x[2:6:2, 1:3, 1:3] + // begin_mask = 1<<1 (ignore begin_index = 1) + // end_mask = 1<<2 (ignore end_index = 2) + y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1}, + StridedSlice::BeginMask(1 << 1).EndMask(1 << 2)); + // y.shape = [2, 3, 3]; + RunTest(x, x_shape, y, {2, 3, 3}); + + // y = [tf.newaxis, 2:6:2, 1:3, 1:3] + y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1}, + StridedSlice::NewAxisMask(1 << 0)); + // y.shape = [1, 2, 2, 2]; + RunTest(x, x_shape, y, {1, 2, 2, 2}); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 7c833878818022c86fd3171ec9cef9fcd3217a24..e17a7c4bf6732177508c250aa034ecc382c505d9 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -88,9 +88,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config, // Converts the graph into an XLA computation, and compiles the // computation. // TODO(toddw): Should we let the user pick the XLA cpu vs. gpu client? - namespace gpu = perftools::gputools; - gpu::Platform* cpu_platform = - gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie(); + se::Platform* cpu_platform = + se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie(); xla::CompileOnlyClient* client = xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform) .ValueOrDie(); diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc index 47ef5f82cbc718ea300afa0c4eb4b73e1ca22fd0..6b098049cbd7539a2b2e2696b13139a8a6b28e0f 100644 --- a/tensorflow/compiler/aot/test.cc +++ b/tensorflow/compiler/aot/test.cc @@ -35,6 +35,7 @@ limitations under the License. // clang-format on #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 50fa95c4f322e85c22f7be2d63f2bcd194ee419e..af2965bba5b91a66e206f05bb8945b0dcde1d2b4 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -180,6 +180,7 @@ cc_library( "//tensorflow/core/kernels:no_op", "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:variable_ops", + "@com_google_absl//absl/memory", ], ) @@ -256,19 +257,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "graph_to_functiondef", - srcs = ["graph_to_functiondef.cc"], - hdrs = ["graph_to_functiondef.h"], - visibility = [":friends"], - deps = [ - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], -) - cc_library( name = "create_xla_launch_op", srcs = [ @@ -299,7 +287,6 @@ cc_library( ], deps = [ ":common", - ":graph_to_functiondef", ":shape_inference_helpers", ":union_find", "//tensorflow/compiler/jit/graphcycles", @@ -346,28 +333,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "graph_to_functiondef_test", - size = "small", - srcs = [ - "graph_to_functiondef_test.cc", - ], - deps = [ - ":graph_to_functiondef", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:cc_ops_internal", - "//tensorflow/cc:function_ops", - "//tensorflow/cc:ops", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework_internal", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - ], -) - tf_cc_test( name = "compilation_passes_test", size = "small", @@ -378,7 +343,6 @@ tf_cc_test( deps = [ ":common", ":compilation_passes", - ":graph_to_functiondef", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:function_ops", diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 9465385b5856baf4d03f280ff30572e196a7663b..f06debaf316c0172a5683e56aa5de6ebb83fbece 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -22,7 +22,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/graph_to_functiondef.h" +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" @@ -160,6 +161,11 @@ class Encapsulator { std::move(outside_compilation_attribute)), graph_in_(graph_in) {} + // Find dependencies between subgraphs and outside_compilation clusters that + // only manifest via edges between outside_compilation clusters in the outer + // (non-compiled) graph. + Status FindClusterDependencies(); + // Find subgraphs marked with 'group_attribute', and build a new // subgraph, one for each value of 'group_attribute'. Status SplitIntoSubgraphs(); @@ -230,6 +236,19 @@ class Encapsulator { // the shapes of any ancestor RAH outputs. If it can be determined that the // shape of the SFH inputs will not be inferrable even once the shapes of the // RAH outputs are known, an error is returned by the rewriter. + // + // Once edges between compiled and outside_compilation clusters have been + // replaced by send/recv ops, some dependencies may no longer be apparent. + // A clustering pass finds all the dependencies between HC nodes that are only + // present as a result of edges between nodes in outside_compilaton clusters. + // Suppose there is a path from outside_compilation cluster C in subgraph S + // to outside_compilation cluster D in subgraph T. If S != T then a control + // edge is added from the call node for S to the call node for T, which + // ensures that C will execute before D because S executes before T. If S==T + // then a control dependency is added between the HC nodes for C and D in S, + // and the HC node for C is added to an 'ancestors' attr in the HC node for D + // so that during compilation of the HC node for D, an XLA control dependency + // can be added to ensure C's SendToHost executes before D's RecvFromHost. class Subgraph { public: // Creates a graph to build the subgraph in, if it doesn't already exist, @@ -324,6 +343,18 @@ class Encapsulator { void RecordOutsideCompilationOutputOrControl( const string& outside_compilation_id, const Edge* edge); + // Records the fact that there is a path from a node in outside_compilation + // cluster ancestor to node in cluster successor that does not go through + // the subgraph. + void RecordOutsideCompilationDependency(const string& successor, + const string& ancestor); + + // Returns the mapping from outside_compilation cluster C to the set of + // outside_compilation clusters that have a path to C entirely outside + // compiled subgraphs. + const std::unordered_map> + OutsideCompilationAncestorMap() const; + // Adds the HostCompute nodes for each outside_compilation subgraph. Status AddHostComputes( const string& subgraph_name, @@ -406,6 +437,13 @@ class Encapsulator { Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out); + // Get the set of outside_compilation clusters and the dependency edges + // between them. + void GetActiveClusterDependencyGraph( + std::unordered_set* clusters, + std::unordered_set* has_successor, + std::unordered_map>* ancestors_map); + // Builds a _RecvAtHost node producing all the inputs of an // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host. Status AddRecvAtHostNode(const string& group_attribute, @@ -468,6 +506,14 @@ class Encapsulator { // The outside_compilation clusters in this subgraph. std::unordered_map outside_compilation_subgraphs_; + // For each outside_compilation cluster C, the outside_compilation clusters + // that have a path to C outside the compiled graph. + std::unordered_map> + outside_compilation_ancestors_; + // For each outside_compilation cluster C, the outside_compilation clusters + // that have a path from C outside the compiled graph. + std::unordered_map> + outside_compilation_successors_; // NoOp node in the output graph that is sequenced after the call node and // used to prevent host-side outside_compilation sends and recvs from being @@ -556,6 +602,10 @@ class Encapsulator { std::unordered_set, NodeSlot::PairHasher>* edges_added); + // Adds control dependencies between subgraph call nodes that have + // dependencies via outside_compilation edges. + Status AddCallNodeDependencies(Graph* graph_out); + // Adds all edges to the output graph. Status AddEdgesToOutputGraph( const std::unordered_map& node_images, @@ -620,10 +670,65 @@ class Encapsulator { const Graph* graph_in_; std::unordered_map subgraphs_; + // For each subgraph S the subgraphs S' such that there is a path in some + // outside_compilation cluster C in S to some outside_compilation cluster C' + // in S', that goes only through the uncompiled graph. + std::unordered_map> subgraph_ancestors_; TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator); }; +namespace { + +// Return in 'sorted' a topological sort of clusters according to the +// dependencies encoded in ancestors. clusters is the list of all clusters +// including clusters that are not present in the ancestors map. has_successors +// is the set of clusters that are ancestors of some other cluster. +void TopologicalClusterSort( + const std::unordered_set& clusters, + const std::unordered_set& has_successors, + const std::unordered_map>& ancestors, + std::vector* sorted) { + // The nodes are placed in 'sorted' in topological order. + sorted->clear(); + // We don't use the standard DFS because we are not operating on Node* + // objects. + struct Work { + string cluster; + bool leave; + }; + std::set visited; + std::vector stack; + // Seed the processing list with clusters that have no successors. + for (const auto& cluster : clusters) { + if (has_successors.find(cluster) == has_successors.end()) { + stack.push_back({cluster, false}); + } + } + while (!stack.empty()) { + const Work item = stack.back(); + stack.pop_back(); + if (item.leave) { + sorted->push_back(item.cluster); + continue; + } + + if (visited.find(item.cluster) != visited.end()) continue; + visited.insert(item.cluster); + + stack.push_back({item.cluster, true}); + const auto& iter = ancestors.find(item.cluster); + if (iter != ancestors.end()) { + for (const auto& ancestor : iter->second) { + stack.push_back({ancestor, false}); + } + } + } + CHECK(sorted->size() == clusters.size()); +} + +} // namespace + Node* Encapsulator::Subgraph::GetCallNodeForInputs() const { return call_node_inputs_; } @@ -786,12 +891,71 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( } } +void Encapsulator::Subgraph::RecordOutsideCompilationDependency( + const string& successor, const string& ancestor) { + outside_compilation_ancestors_[successor].insert(ancestor); + outside_compilation_successors_[ancestor].insert(successor); +} + +const std::unordered_map> +Encapsulator::Subgraph::OutsideCompilationAncestorMap() const { + return outside_compilation_ancestors_; +} + +void Encapsulator::Subgraph::GetActiveClusterDependencyGraph( + std::unordered_set* clusters, + std::unordered_set* has_successor, + std::unordered_map>* ancestors_map) { + // During initial clustering the ancestor and successor datastructures may + // have been built including oc_cluster names that never turned into subgraphs + // because they had no edges into or out of the compiled cluster. Remove them + // before proceeding to simplify the logic. Get the set of clusters that was + // actually added, then remove references to the others. + for (const auto& oc_subgraph : outside_compilation_subgraphs_) { + clusters->insert(oc_subgraph.first); + } + for (const auto& cluster : outside_compilation_successors_) { + if (clusters->find(cluster.first) != clusters->end()) { + for (const auto& successor : cluster.second) { + if (clusters->find(successor) != clusters->end()) { + has_successor->insert(cluster.first); + break; + } + } + } + } + for (const auto& cluster : outside_compilation_ancestors_) { + if (clusters->find(cluster.first) != clusters->end()) { + std::unordered_set& ancestors = (*ancestors_map)[cluster.first]; + for (const auto& ancestor : cluster.second) { + if (clusters->find(ancestor) != clusters->end()) { + ancestors.insert(ancestor); + } + } + } + } +} + Status Encapsulator::Subgraph::AddHostComputes( const string& subgraph_name, const std::unordered_map& node_images) { - for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { - const string& oc_subgraph_name = oc_subgraph_iter.first; - OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; + // Get the set of outside_compilation clusters and the dependency edges + // between them. + std::unordered_set clusters; + std::unordered_set has_successor; + std::unordered_map> ancestors_map; + GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map); + // Topologically sort the outside_compilation clusters according to their + // dependency relation. + std::vector sorted_clusters; + TopologicalClusterSort(clusters, has_successor, ancestors_map, + &sorted_clusters); + + // The host compute nodes added for each outside_compilation_cluster; + std::unordered_map host_compute_node; + for (const string& oc_subgraph_name : sorted_clusters) { + OutsideCompilationSubgraph& oc_subgraph = + outside_compilation_subgraphs_[oc_subgraph_name]; if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() || !oc_subgraph.outputs_by_src.empty() || !oc_subgraph.control_outputs.empty()) { @@ -811,13 +975,22 @@ Status Encapsulator::Subgraph::AddHostComputes( inputs[input_index].Reset(src_image->name(), src_slot, dtype); input_dtypes[input_index] = dtype; } - for (const auto& output : oc_subgraph.outputs_by_src) { DataType dtype = output.first.dtype; int output_index = output.second; output_dtypes[output_index] = dtype; } + std::vector host_compute_ancestors; + const auto iter = ancestors_map.find(oc_subgraph_name); + if (iter != ancestors_map.end()) { + for (const string& ancestor_cluster : iter->second) { + host_compute_ancestors.push_back( + outside_compilation_subgraphs_[ancestor_cluster] + .host_compute_name); + } + } + NodeDef host_compute_def; NodeDefBuilder builder(strings::StrCat("outside_compilation_", oc_subgraph_name, "_host_compute"), @@ -825,6 +998,7 @@ Status Encapsulator::Subgraph::AddHostComputes( builder.Input(inputs); builder.Attr("Tinputs", input_dtypes); builder.Attr("Toutputs", output_dtypes); + builder.Attr("ancestors", host_compute_ancestors); builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, "_", oc_subgraph_name)); @@ -834,6 +1008,7 @@ Status Encapsulator::Subgraph::AddHostComputes( Node* host_compute = graph_->AddNode(host_compute_def, &s); if (!s.ok()) return s; + host_compute_node[host_compute->name()] = host_compute; oc_subgraph.host_compute_name = host_compute->name(); // Connect the _HostCompute node to its producers in the subgraph. @@ -852,6 +1027,12 @@ Status Encapsulator::Subgraph::AddHostComputes( graph_->AddControlEdge(src_image, host_compute); } + // Connect the _HostCompute node to its ancestor host compute nodes. + for (const auto& ancestor_name : host_compute_ancestors) { + Node* ancestor = host_compute_node[ancestor_name]; + graph_->AddControlEdge(ancestor, host_compute); + } + // Connect the consumers in the subgraph to the _HostCompute node. for (const auto& output : oc_subgraph.outputs_by_dst) { const Node* dst_node = output.first.node; @@ -1654,6 +1835,17 @@ Status Encapsulator::CopyEdgeToOutputGraph( return Status::OK(); } +Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) { + for (const auto& ancestors : subgraph_ancestors_) { + const string& subgraph = ancestors.first; + for (const string& ancestor : ancestors.second) { + graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(), + subgraphs_[subgraph].GetCallNodeForInputs()); + } + } + return Status::OK(); +} + Status Encapsulator::AddEdgesToOutputGraph( const std::unordered_map& node_images, bool parallel_checking, Graph* graph_out) { @@ -1703,6 +1895,7 @@ Status Encapsulator::AddEdgesToOutputGraph( Subgraph& subgraph = subgraph_entry.second; subgraph.ConnectSequencerToCallNode(graph_out); } + TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out)); return Status::OK(); } @@ -1960,6 +2153,182 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( return Status::OK(); } +namespace { + +// Helper struct for building cluster dependencies and also debugging cycles in +// the dependencies. While computing dependencies we construct a mapping from +// Node* to PathDetails. +struct PathDetails { + struct SubgraphAndCluster { + string subgraph; + string outside_compilation_cluster; + bool operator==(const SubgraphAndCluster& other) const { + return subgraph == other.subgraph && + outside_compilation_cluster == other.outside_compilation_cluster; + } + }; + + struct SubgraphAndClusterHash { + inline std::size_t operator()(const SubgraphAndCluster& v) const { + return hash()( + strings::StrCat(v.subgraph, v.outside_compilation_cluster)); + } + }; + + typedef std::unordered_set + SubgraphAndClusterSet; + + // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as + // ancestors for any successor of this node. If the node is in the outer + // graph, it returns the transitive union of the ancestors of the node's + // inputs. If the node is in an outside_compilation cluster, it returns just + // that cluster. If the node is compiled, it returns the empty set. + SubgraphAndClusterSet AncestorsForSuccessor() { + if (subgraph.empty()) { + return ancestor_clusters; + } else if (outside_compilation_cluster.empty()) { + return SubgraphAndClusterSet(); + } else { + SubgraphAndCluster entry; + entry.subgraph = subgraph; + entry.outside_compilation_cluster = outside_compilation_cluster; + return SubgraphAndClusterSet({entry}); + } + } + + // The transitive union of the ancestor's of this node's inputs. This is only + // saved for debugging in order to print out enough information to debug a + // discovered cycle. + SubgraphAndClusterSet ancestor_clusters; + // The subgraph attr on this node. + string subgraph; + // The outside_compilation attr on this node. + string outside_compilation_cluster; +}; + +// Adds an edge from ancestor to successor to the cycle detector, and returns an +// error if that edge causes the formation of a cycle. In the error case, logs +// the contents of the node_ancestors_map to facilitate debugging. +Status CheckClusterDependencyForCycles( + const string& ancestor, const string& successor, + const std::unordered_map>& ancestors, + const std::unordered_map& node_ancestors_map, + GraphCycles* cycle_detector, std::map* cycle_detector_map) { + if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) { + (*cycle_detector_map)[ancestor] = cycle_detector->NewNode(); + } + if (cycle_detector_map->find(successor) == cycle_detector_map->end()) { + (*cycle_detector_map)[successor] = cycle_detector->NewNode(); + } + + if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor], + (*cycle_detector_map)[successor])) { + LOG(ERROR) << "Cycle in outside_compilation clusters"; + for (const auto& cluster : ancestors) { + LOG(ERROR) << "Cluster " << cluster.first << " depends on:"; + for (const auto& ancestor : cluster.second) { + LOG(ERROR) << " " << ancestor; + } + } + for (const auto& node_ancestors : node_ancestors_map) { + LOG(ERROR) << "Node " << node_ancestors.first->name() << " (" + << node_ancestors.second.subgraph << ";" + << node_ancestors.second.outside_compilation_cluster + << ") has ancestor clusters:"; + for (const auto& ancestor : node_ancestors.second.ancestor_clusters) { + LOG(ERROR) << " " << ancestor.subgraph << ";" + << ancestor.outside_compilation_cluster; + } + } + return errors::InvalidArgument( + "Can't compile outside_compilation clusters because there is a " + "dependency cycle: see error log for details."); + } + return Status::OK(); +} + +} // namespace + +Status Encapsulator::FindClusterDependencies() { + // Map from nodes to ancestor details. A node is entered into the map if it is + // in a compilation subgraph, and outside_compilation cluster, or appears on a + // path in the outer graph leading from an outside_compilation subgraph. + std::unordered_map node_ancestors_map; + // We check that clusters are acyclic using this cycle detector. + GraphCycles cycle_detector; + // Map from cluster name to cycle detector node id. + std::map cycle_detector_map; + // Process the nodes in topologically-sorted order. + std::vector nodes; + GetReversePostOrder(*graph_in_, &nodes); + for (Node* node : nodes) { + string subgraph_name; + string oc_cluster; + TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster)); + // First create an entry in the ancestors map if the node is in a compiled + // subgraph or outside_compilation cluster, or if any incoming edge is from + // a node with an ancestor map entry; and find the union of all the + // ancestors. + if (!subgraph_name.empty()) { + node_ancestors_map[node].subgraph = subgraph_name; + node_ancestors_map[node].outside_compilation_cluster = oc_cluster; + } + for (Node* src : node->in_nodes()) { + const auto iter = node_ancestors_map.find(src); + if (iter != node_ancestors_map.end()) { + const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor(); + for (const auto& ancestor : ancestors_to_follow) { + if (ancestor.subgraph != subgraph_name || + ancestor.outside_compilation_cluster != oc_cluster) { + node_ancestors_map[node].ancestor_clusters.insert(ancestor); + } + } + } + } + if (!subgraph_name.empty()) { + // The node is in a compiled subgraph or an outside_compilation cluster. + if (oc_cluster.empty()) { + // The node is not in an outside_compilation cluster. Record the + // subgraph's ancestor dependencies. + for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { + if (cluster.subgraph != subgraph_name) { + subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.subgraph, subgraph_name, subgraph_ancestors_, + node_ancestors_map, &cycle_detector, &cycle_detector_map)); + } + } + } else { + Subgraph& subgraph = subgraphs_[subgraph_name]; + // The node is in an outside_compilation cluster. Record the cluster + // and/or subgraph ancestor dependencies. + for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { + if (cluster.subgraph == subgraph_name) { + // The ancestor is in the same subgraph. + if (cluster.outside_compilation_cluster != oc_cluster) { + // But not in the same oc_cluster, so record the dependency. + subgraph.RecordOutsideCompilationDependency( + oc_cluster, cluster.outside_compilation_cluster); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.outside_compilation_cluster, oc_cluster, + subgraph.OutsideCompilationAncestorMap(), node_ancestors_map, + &cycle_detector, &cycle_detector_map)); + } + } else { + // The ancestor is in a different subgraph, so record the + // dependency. + subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.subgraph, subgraph_name, subgraph_ancestors_, + node_ancestors_map, &cycle_detector, &cycle_detector_map)); + } + } + } + } + } + return Status::OK(); +} + Status Encapsulator::MakePrunedGraphCopyAndInline( const Graph& graph, const std::vector& sink_nodes, std::unique_ptr* pruned_graph, @@ -2166,6 +2535,7 @@ Status EncapsulateSubgraphsInFunctions( Encapsulator encapsulator(std::move(group_attribute), std::move(outside_compilation_attribute), &graph_in); + TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies()); TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs()); TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs( diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 8599a7038af9663e5af6f3231429cb7f6ea5f69b..5ec24d39a2c40a766dbb0ec51ebe798de620e24b 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -20,8 +20,8 @@ limitations under the License. #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -74,7 +74,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, if (!compare(elt_a.first, elt_a.second, iter->second)) { if (diff) { *diff = strings::StrCat(map_name, " expected: element with key '", - key_to_string(elt_a.first), " has value '", + key_to_string(elt_a.first), "' has value '", value_to_string(elt_a.second), "' got: '", value_to_string(iter->second), "'"); } @@ -121,8 +121,22 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } + std::unordered_set control_input_a; + std::unordered_set control_input_b; for (int i = 0; i < a.input_size(); ++i) { - if (a.input(i) != b.input(i)) { + if (str_util::StartsWith(a.input(i), "^")) { + if (!str_util::StartsWith(b.input(i), "^")) { + if (diff) { + *diff = strings::StrCat( + diff_preamble, " mismatch for node ", a.name(), " input ", i, + ", expected control input ", a.input(i), " got ", b.input(i), + " expected:\n", a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } + control_input_a.insert(a.input(i)); + control_input_b.insert(b.input(i)); + } else if (a.input(i) != b.input(i)) { if (diff) { *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), " input ", i, ", expected ", a.input(i), @@ -132,11 +146,29 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, return false; } } + if (control_input_a != control_input_b) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + " control inputs differ expected:\n", + a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } return EqualProtoMap( a.attr(), b.attr(), [](const string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, [](const string& key, const AttrValue& av, const AttrValue& bv) { - return av.DebugString() == bv.DebugString(); + if (key == "ancestors") { + // The ancestors are added from a set so the order is unpredictable; + // just compare set equality not list equality. + std::unordered_set a_set(av.list().s().begin(), + av.list().s().end()); + std::unordered_set b_set(bv.list().s().begin(), + bv.list().s().end()); + return a_set == b_set; + } else { + return av.DebugString() == bv.DebugString(); + } }, strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()), diff); @@ -261,6 +293,7 @@ REGISTER_OP("XlaHostCompute") .Output("outputs: Toutputs") .Attr("Tinputs: list(type) >= 0") .Attr("Toutputs: list(type) >= 0") + .Attr("ancestors: list(string) >= 0") .Attr("key: string") .Attr("shape_inference_graph: string = ''") .Attr("shapes: list(shape) >= 0") @@ -899,6 +932,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"C:o:0", "c:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1044,17 +1078,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"D:o:0", "F:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, {"key", "host_compute_channel_F1_O2"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O2"}, {"shapes", gtl::ArraySlice({})}, {"_outside_compilation_subgraph", "O2"}}, - {"F"}}, + {"F", "outside_compilation_O1_host_compute"}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1193,6 +1230,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1215,6 +1253,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"G:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F2_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1279,6 +1318,179 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test with two functions to transform, each with one outside_compilation +// cluster, with the dependency between them purely from an outside_compilation +// edge. +TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = InputShaped(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Binary(c, d, + b1.opts() + .WithName("E") + .WithControlInputs({b, d}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Binary(c, e, + b1.opts().WithName("F").WithControlInput(e).WithAttr( + "_encapsulate", "F1")); + Node* g = + Binary(a, b, b1.opts().WithName("G").WithAttr("_encapsulate", "F2")); + Node* h = Unary(g, b1.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1") + .WithControlInput(e)); + Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2")); + Binary(f, i, b1.opts().WithName("J")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT, DT_FLOAT}, shape.opts()); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected)); + } + + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1", + {DT_FLOAT}, shape.opts()); + Node* h = Unary(recv, shape.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, + "BinaryTest", + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, + {}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"C:o:0", "D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}, + {"D"}}, + }, + {{"f_0_retval", "F:o:0"}}); + + *library_expected.add_function() = FunctionDefHelper::Create( + "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {}, + { + {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}}, + {{"I"}, + "UnaryTest", + {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"G:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F2_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F2_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + }, + {{"i_0_retval", "I:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = InputShaped(b2.opts().WithName("B")); + + Node* key_constant1 = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1", + {DT_FLOAT, DT_FLOAT}, b2.opts()); + Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), + b2.opts() + .WithName("E") + .WithControlInputs({recv1, b}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e}, + b2.opts().WithControlInput(e)); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}), + "F1"); + + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b); + Node* call1 = + b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1); + + Node* key_constant2 = + KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1", + {DT_FLOAT}, b2.opts()); + Node* h = Unary(recv2, b2.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1") + .WithControlInput(e)); + Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h}, + b2.opts()); + + Node* s2 = Sequencer( + b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}), + "F2"); + NodeBuilder node_builder2("F2", "F2", lib_def.get()); + node_builder2.Input(a).Input(b); + Node* call2 = b2.opts() + .WithControlInputs({s2, call1}) + .FinalizeBuilder(&node_builder2); + Binary(call1, call2, b2.opts().WithName("J")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + // Test with one outside_compilation cluster that has no inputs from the // compiled subgraph. TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { @@ -1323,6 +1535,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {}, {{"Tinputs", gtl::ArraySlice({})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1406,6 +1619,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {}, {{"Tinputs", gtl::ArraySlice({})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1487,6 +1701,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", gtl::ArraySlice({})}, @@ -1567,6 +1782,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", gtl::ArraySlice({})}, @@ -1607,6 +1823,371 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph, where the ancestor has no HostCompute Op. +TEST(EncapsulateSubgraphsTest, + OutsideCompilationClusterDependencyNoSrcCluster) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(a, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + Node* g = Unary(f, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(g, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + Binary(e, h, b1.opts().WithName("I")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, shape2.opts()); + Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, "UnaryTest", {"D:o:0"}}, + {{"H"}, + "UnaryTest", + {"outside_compilation_O2_host_compute:outputs:0"}}, + {{"outside_compilation_O2_host_compute"}, + "XlaHostCompute", + {"F:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O2"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O2"}}}, + }, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* e = Unary(a, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, b2.opts()); + Node* g = Unary(recv, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts()); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("I")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph, where the successor has no HostCompute Op. +TEST(EncapsulateSubgraphsTest, + OutsideCompilationClusterDependencyNoDstCluster) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(d, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + /*Node* g =*/Unary(a, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + Binary(e, h, b1.opts().WithName("I")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, shape1.opts()); + Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, + "UnaryTest", + {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"H"}, "UnaryTest", {"F:o:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + }, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, b2.opts()); + Node* e = Unary(recv, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts()); + /*Node* g =*/Unary(a, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("I")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph. +TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(d, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + Node* g = Unary(d, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + /*Node* i =*/Binary(d, e, + b1.opts() + .WithName("I") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O3") + .WithControlInput(g)); + Binary(e, h, b1.opts().WithName("J")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, shape1.opts()); + Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + {{{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"H"}, "UnaryTest", {"F:o:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + {{"outside_compilation_O2_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O2"}}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O3_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}, + {"key", "host_compute_channel_F1_O3"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O3"}}, + {"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"}}}, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, b2.opts()); + Node* e = Unary(recv1, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts()); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, b2.opts()); + Node* g = Unary(recv2, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3", + {DT_FLOAT}, b2.opts()); + /*Node* i =*/Binary(recv3, e, + b2.opts() + .WithName("I") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O3") + .WithControlInput(g)); + Node* s1 = Sequencer(b2.opts() + .WithName("F1_sequencer") + .WithControlInputs({recv1, send, recv2, recv3}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("J")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + // Test with one outside_compilation cluster that has no outputs from the // compiled subgraph. TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { @@ -1731,6 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { {"c:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index f48941fce329313e4484b3c2dd900eeac884ed34..049d170fa48928474b894f2d0e1f2243c5f87275 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -37,8 +37,6 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/util/stream_executor_util.h" -namespace gpu = perftools::gputools; - namespace tensorflow { XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) @@ -51,9 +49,9 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) num_constant_args_ = constant_types.size(); OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_)); if (device_type_ == DeviceType(DEVICE_CPU)) { - platform_id_ = gpu::host::kHostPlatformId; + platform_id_ = se::host::kHostPlatformId; } else if (device_type_ == DeviceType(DEVICE_GPU)) { - platform_id_ = gpu::cuda::kCudaPlatformId; + platform_id_ = se::cuda::kCudaPlatformId; } else { platform_id_ = nullptr; } @@ -69,9 +67,9 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx, return Status::OK(); } - auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_); + auto platform = se::MultiPlatformManager::PlatformWithId(platform_id_); if (!platform.ok()) { - return StreamExecutorUtil::ConvertStatus(platform.status()); + return platform.status(); } xla::LocalClientOptions client_options; client_options.set_platform(platform.ValueOrDie()); @@ -100,7 +98,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { ResourceMgr* rm = ctx->resource_manager(); OP_REQUIRES(ctx, rm, errors::Internal("No resource manager.")); - gpu::Stream* stream = + se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; XlaCompilationCache* cache; @@ -153,7 +151,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { options.device_type = &cache->device_type(); options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); options.graph_def_version = ctx->function_library()->graph_def_version(); - options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId); + options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId); options.device_allocator = xla_allocator; // TODO(b/77671268): We don't set variable_representation_shape_fn here. This // is restricted to Variables, but we need something like this to apply to diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h index c6cc0986af0300c51283d432c671e92a1e4d8145..8f8e646f0ff6d94dfdf56721cacfce7fa658beb6 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.h +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h @@ -53,7 +53,7 @@ class XlaLocalLaunchOp : public OpKernel { // Number of resource variable arguments. int num_resource_args_; - perftools::gputools::Platform::Id platform_id_; + se::Platform::Id platform_id_; TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp); }; diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 682d6ea8ccc4a54912ccad4666cf0a7a03a7a698..60458f6f3314b2c3b65be1c90e051b2a670383bc 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -58,7 +58,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, launch_context.PopulateInputs(ctx, result, variables); - perftools::gputools::Stream* stream = + se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; TF_RET_CHECK(stream); @@ -67,6 +67,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, run_options.set_stream(stream); run_options.set_allocator(client->backend().memory_allocator()); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); + run_options.set_rng_seed(ctx->step_id()); auto run_result = executable->Run(launch_context.arguments(), run_options); TF_RETURN_IF_ERROR(run_result.status()); diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 12f471735f68394a3079541e9ac8532e329bd694..c814b7eb029054d9d3659d52488729ae5989506a 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device_context.h" @@ -50,8 +51,6 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/stream_executor_util.h" -namespace se = ::perftools::gputools; - namespace tensorflow { // Caches a XlaDeviceAllocator per pair. A @@ -121,7 +120,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator( auto platform = se::MultiPlatformManager::PlatformWithName(platform_name); if (!platform.ok()) { - return StreamExecutorUtil::ConvertStatus(platform.status()); + return platform.status(); } const DeviceAttributes attrs = Device::BuildDeviceAttributes( @@ -181,9 +180,15 @@ XlaDevice::XlaDevice(const SessionOptions& options, jit_device_name_(jit_device_name), xla_allocator_(nullptr), platform_(platform), - transfer_as_literal_(transfer_as_literal) {} + transfer_as_literal_(transfer_as_literal) { + VLOG(1) << "Created XLA device " << jit_device_name; +} -XlaDevice::~XlaDevice() {} +XlaDevice::~XlaDevice() { + if (gpu_device_info_ != nullptr) { + gpu_device_info_->default_context->Unref(); + } +} xla::LocalClient* XlaDevice::client() const { // We lazily create the client because the platform commits to the @@ -191,9 +196,8 @@ xla::LocalClient* XlaDevice::client() const { // don't want to do it until we get a chance to hook the platform up // to a simulator. - // For now GetOrCreateLocalClient always returns success when passed - // a non-null platform. If that changes we may have to plumb in some - // way to pass Status back. + // TODO(b/78468222): This can fail, at least when the backend is GPU and + // there is no GPU on the host. return xla::ClientLibrary::GetOrCreateLocalClient(platform_).ValueOrDie(); } @@ -218,14 +222,31 @@ xla::StatusOr XlaDevice::GetStream() { return stream_.get(); } +Status XlaDevice::CreateAndSetGpuDeviceInfo() { + if (gpu_device_info_ == nullptr) { + TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); + // Call GetAllocator for the side-effect of ensuring the allocator + // is created. + GetAllocator({}); + // XlaDevice owns both gpu_device_info_ and + // gpu_device_info_->default_context. + gpu_device_info_ = absl::make_unique(); + gpu_device_info_->stream = stream; + gpu_device_info_->default_context = + new XlaDeviceContext(stream, client(), transfer_as_literal_); + set_tensorflow_gpu_device_info(gpu_device_info_.get()); + } + + return Status::OK(); +} + Status XlaDevice::FillContextMap(const Graph* graph, DeviceContextMap* device_context_map) { VLOG(1) << "XlaDevice::FillContextMap"; device_context_map->resize(graph->num_node_ids()); TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - // Call GetAllocator for the side-effect of ensuring the allocator and - // XlaTensorInfoManager is created. - (void)GetAllocator({}); + // Call GetAllocator for the side-effect of ensuring the allocator is created. + GetAllocator({}); auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_); for (Node* n : graph->nodes()) { VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name(); diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 4fe7dd8c9fa9eb954804555e9615160dc4bc3e8a..3ae87308cc7cffa916e178893df70a3f314b11b0 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -49,20 +49,20 @@ class XlaDevice : public LocalDevice { // retrieved e.g., when lazily creating the XlaCompilationCache device. class Metadata { public: - Metadata(int device_ordinal, perftools::gputools::Platform* platform, + Metadata(int device_ordinal, se::Platform* platform, const DeviceType& device_type); // The index of the device on this host. int device_ordinal() const; - perftools::gputools::Platform* platform() const; + se::Platform* platform() const; xla::LocalClient* client() const; const DeviceType& jit_device_type() const; private: const int device_ordinal_; const DeviceType device_type_; - perftools::gputools::Platform* platform_; // Not owned. + se::Platform* platform_; // Not owned. TF_DISALLOW_COPY_AND_ASSIGN(Metadata); }; @@ -85,8 +85,7 @@ class XlaDevice : public LocalDevice { XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs, int device_ordinal, const DeviceType& jit_device_name, - ::perftools::gputools::Platform* platform, - bool transfer_as_literal); + se::Platform* platform, bool transfer_as_literal); ~XlaDevice() override; Allocator* GetAllocator(AllocatorAttributes attr) override; @@ -103,7 +102,11 @@ class XlaDevice : public LocalDevice { Tensor* tensor) override; xla::LocalClient* client() const; - xla::StatusOr<::perftools::gputools::Stream*> GetStream(); + xla::StatusOr GetStream(); + + // If not already set, create and set GpuDeviceInfo. + // Not thread-safe + Status CreateAndSetGpuDeviceInfo(); private: // The metadata of this XlaDevice. @@ -114,7 +117,7 @@ class XlaDevice : public LocalDevice { DeviceType jit_device_name_; // Memory allocator associated with this device. Allocator* xla_allocator_; // Not owned. - ::perftools::gputools::Platform* platform_; // Not owned. + se::Platform* platform_; // Not owned. // Stream associated with this device. Operations enqueued on this // stream are executed on the device. Operations include data // copying back and forth between CPU and the device, and @@ -123,6 +126,10 @@ class XlaDevice : public LocalDevice { // Must we use XLA's transfer manager for correct host<->device transfers? if // false, we can use ThenMemcpy() instead. bool transfer_as_literal_; + + // If set, holds default device context (that we must Unref) + // and its stream. + std::unique_ptr gpu_device_info_; }; // Builds OpKernel registrations on 'device' for the JIT operators diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 43eb164012610723214cf39360698010c9dbdbd4..bf8c1886a022310eeaacdf69463f575a393dd8d0 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -23,8 +23,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/platform/mem.h" -namespace se = ::perftools::gputools; - namespace tensorflow { // The allocator used for Tensors assigned to the XLA device. diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index ad914a1c23b5f2ea7063722f85e027a99fdb68f9..d7f5f1d208989256f8043d2e6d93cf9bd89333b2 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -45,8 +45,7 @@ class XlaDeviceAllocator : public Allocator { // Helper class for managing data transfers between host and XLA devices. class XlaTransferManager { public: - explicit XlaTransferManager(perftools::gputools::Stream* stream, - xla::LocalClient* client, + explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, @@ -54,7 +53,7 @@ class XlaTransferManager { void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece tensor_name, Device* device, Tensor* cpu_tensor, StatusCallback done); - perftools::gputools::Stream* stream() const { return stream_; } + se::Stream* stream() const { return stream_; } private: Status TransferLiteralToDevice(const Tensor& host_tensor, @@ -64,7 +63,7 @@ class XlaTransferManager { // Stream obtained from a Device, used to transfer tensors between // CPU and device. - perftools::gputools::Stream* stream_; + se::Stream* stream_; // For the underlying memory allocator and XLA's TransferManager. xla::LocalClient* client_; // Transfer manager, for marshalling data to and from the device. @@ -78,8 +77,8 @@ class XlaTransferManager { // wraps the methods in XlaTransferManager. class XlaDeviceContext : public DeviceContext { public: - explicit XlaDeviceContext(perftools::gputools::Stream* stream, - xla::LocalClient* client, bool transfer_as_literal); + explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client, + bool transfer_as_literal); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, @@ -87,9 +86,7 @@ class XlaDeviceContext : public DeviceContext { void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece tensor_name, Device* device, Tensor* cpu_tensor, StatusCallback done) override; - perftools::gputools::Stream* stream() const override { - return manager_.stream(); - } + se::Stream* stream() const override { return manager_.stream(); } private: XlaTransferManager manager_; diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index ac60423d959ca44e7d92e2d965cf731287b1f83f..a8afbf9dcd736bb292b7c5f52c7cce2b47fb85b6 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -54,6 +54,15 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options, VLOG(1) << "Failed to create XLA_GPU device: " << status; return Status::OK(); } + + // TODO(b/78468222): Uncomment after fixing this bug + // status = device->CreateAndSetGpuDeviceInfo(); + // if (!status.ok()) { + // errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, + // " device"); + // return status; + // } + devices->push_back(device.release()); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 50b0061d692f2a8c5ea475c0b00c4cb42a1a84e6..2a7f04271d4b7ea330f32b88ea1e3f4037988a91 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -32,9 +32,11 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/util/stream_executor_util.h" -namespace gpu = perftools::gputools; - namespace tensorflow { +namespace { +using xla::ScopedShapedBuffer; +using xla::ShapedBuffer; +} // anonymous namespace std::map SnapshotResourceVariables(OpKernelContext* ctx, int num_variables) { @@ -54,24 +56,23 @@ std::map SnapshotResourceVariables(OpKernelContext* ctx, return snapshot; } -XlaAllocator::XlaAllocator(const gpu::Platform* platform, Allocator* wrapped) +XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped) : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {} XlaAllocator::~XlaAllocator() {} -xla::StatusOr XlaAllocator::Allocate( +xla::StatusOr XlaAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size); if (data == nullptr) { return errors::ResourceExhausted("Out of memory while trying to allocate ", size, " bytes."); } else { - return gpu::DeviceMemoryBase(data, size); + return se::DeviceMemoryBase(data, size); } } -Status XlaAllocator::Deallocate(int device_ordinal, - gpu::DeviceMemoryBase* mem) { +Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) { wrapped_->DeallocateRaw(mem->opaque()); return Status::OK(); } @@ -80,17 +81,17 @@ namespace { // Return the 'index''th subtree of the given ShapedBuffer as a // ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the // subtree, and sets the input's buffer pointers to nullptr for the subtree. -std::unique_ptr ExtractSubShapedBuffer( - xla::ShapedBuffer* shaped_buffer, int index, +ScopedShapedBuffer ExtractSubShapedBuffer( + ShapedBuffer* shaped_buffer, int index, xla::DeviceMemoryAllocator* allocator) { xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape( shaped_buffer->on_host_shape(), index); xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape( shaped_buffer->on_device_shape(), index); - xla::ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, - shaped_buffer->platform(), - shaped_buffer->device_ordinal()); + ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, + shaped_buffer->platform(), + shaped_buffer->device_ordinal()); auto& shape_tree = shaped_buffer->buffers(); auto& sub_shape_tree = sub_shaped_buffer.buffers(); @@ -99,11 +100,10 @@ std::unique_ptr ExtractSubShapedBuffer( /*target_base_index=*/{}); for (auto& index_to_buffer : shape_tree) { if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) { - index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0); + index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0); } } - return xla::ScopedShapedBuffer::MakeScoped(&sub_shaped_buffer, allocator) - .ValueOrDie(); + return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator); } } // namespace @@ -118,10 +118,10 @@ XlaComputationLaunchContext::XlaComputationLaunchContext( void XlaComputationLaunchContext::PopulateInputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, const std::map& variables) { - // Build xla::ShapedBuffers that point directly to the Tensor buffers. + // Build ShapedBuffers that point directly to the Tensor buffers. arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1); arg_buffers_.resize(kernel->xla_input_shapes.size()); - arg_ptrs_ = std::vector(arg_buffers_.size()); + arg_ptrs_ = std::vector(arg_buffers_.size()); // Pass remaining parameters. const Tensor* t; @@ -140,16 +140,15 @@ void XlaComputationLaunchContext::PopulateInputs( if (xla::ShapeUtil::IsTuple(on_device_shape)) { const XlaTensor* xla_tensor = XlaTensor::FromTensor(t); CHECK(xla_tensor && xla_tensor->has_shaped_buffer()); - arg_ptrs_[i] = - const_cast(&xla_tensor->shaped_buffer()); + arg_ptrs_[i] = const_cast(&xla_tensor->shaped_buffer()); } else { CHECK(xla::ShapeUtil::Equal(shape, on_device_shape)) << "On-device shape " << xla::ShapeUtil::HumanStringWithLayout(on_device_shape) << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); - gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_[i] = xla::MakeUnique( + se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); + arg_buffers_[i] = xla::MakeUnique( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); arg_buffers_[i]->set_buffer(dmem, /*index=*/{}); @@ -160,15 +159,15 @@ void XlaComputationLaunchContext::PopulateInputs( void XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, - std::unique_ptr output) { - gpu::Stream* stream = + ScopedShapedBuffer output) { + se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Computation output should always be a tuple. if (VLOG_IS_ON(2)) { - VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString(); + VLOG(2) << "Result tuple shape: " << output.on_host_shape().DebugString(); VLOG(2) << "Result tuple shape (on device): " - << output->on_device_shape().DebugString(); + << output.on_device_shape().DebugString(); } CHECK_EQ(ctx->num_outputs(), kernel->outputs.size()); @@ -226,18 +225,18 @@ void XlaComputationLaunchContext::PopulateOutputs( const TensorShape& shape = kernel->outputs[i].shape; VLOG(2) << "Retval " << i << " shape " << shape.DebugString(); - gpu::DeviceMemoryBase buffer = output->buffer({output_num}); + se::DeviceMemoryBase buffer = output.buffer({output_num}); if (allocate_xla_tensors_) { Tensor* output_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor)); XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor); CHECK(xla_tensor); - xla_tensor->set_shaped_buffer( - ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_)); + xla_tensor->set_shaped_buffer(ScopedShapedBuffer( + ExtractSubShapedBuffer(&output, output_num, xla_allocator_))); } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( ctx->expected_output_dtype(i), shape, buffer, allocator); - output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num}); ctx->set_output(i, output_tensor); } ++output_num; @@ -257,7 +256,7 @@ void XlaComputationLaunchContext::PopulateOutputs( write.input_index >= 0 && write.input_index < ctx->num_inputs(), errors::Internal("Invalid input index for variable write.")); - gpu::DeviceMemoryBase buffer = output->buffer({output_num}); + se::DeviceMemoryBase buffer = output.buffer({output_num}); Var* variable = nullptr; // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, @@ -282,12 +281,12 @@ void XlaComputationLaunchContext::PopulateOutputs( XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor); CHECK(xla_tensor); xla_tensor->set_shaped_buffer( - ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_)); + ExtractSubShapedBuffer(&output, output_num, xla_allocator_)); *variable->tensor() = output_tensor; } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( write.type, write.shape, buffer, allocator); - output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num}); *variable->tensor() = output_tensor; } ++output_num; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 14f70fe35891040ff3460567adb223be0f1c910f..8a6ff3b0c751206d184da63ef1a36e750a1252a5 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -46,13 +46,11 @@ std::map SnapshotResourceVariables(OpKernelContext* ctx, // see comment on `AllowsAsynchronousDeallocation()`. class XlaAllocator : public xla::DeviceMemoryAllocator { public: - XlaAllocator(const perftools::gputools::Platform* platform, - Allocator* wrapped); + XlaAllocator(const se::Platform* platform, Allocator* wrapped); ~XlaAllocator() override; - xla::StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) override; - Status Deallocate(int device_ordinal, - perftools::gputools::DeviceMemoryBase* mem) override; + xla::StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override; // The Tensorflow BFC allocator used on GPU allows host-side deallocation // before GPU execution takes place. Tensorflow uses the ordering of the main @@ -87,7 +85,7 @@ class XlaComputationLaunchContext { // Given the XLA output in `output`, populate all outputs of `ctx`. void PopulateOutputs(OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, - std::unique_ptr output); + xla::ScopedShapedBuffer output); // Return the argument list. Only valid after PopulateInputs() has been // called. @@ -126,8 +124,7 @@ class XlaTensorBuffer : public TensorBuffer { } static Tensor MakeTensor(DataType dtype, const TensorShape& shape, - perftools::gputools::DeviceMemoryBase buffer, - Allocator* allocator) { + se::DeviceMemoryBase buffer, Allocator* allocator) { size_t expected_size = shape.num_elements() * DataTypeSize(dtype); auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size, buffer.size(), allocator); diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index 956328e6757f4c903e3995a54635682d19052794..ce6456880bc1b3bc15ac0ef4bae35a83771098ef 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -31,16 +31,15 @@ namespace tensorflow { return FromTensor(const_cast(tensor)); } -/*static*/ perftools::gputools::DeviceMemoryBase -XlaTensor::DeviceMemoryFromTensor(const Tensor& tensor) { +/*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor( + const Tensor& tensor) { const XlaTensor* xla_tensor = FromTensor(&tensor); if (xla_tensor) { CHECK(xla_tensor->has_shaped_buffer()); return xla_tensor->shaped_buffer().root_buffer(); } else { - return perftools::gputools::DeviceMemoryBase( - const_cast(tensor.tensor_data().data()), - tensor.tensor_data().size()); + return se::DeviceMemoryBase(const_cast(tensor.tensor_data().data()), + tensor.tensor_data().size()); } } @@ -65,10 +64,8 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, device_ordinal, size, /*retry_on_failure=*/false)); } - TF_ASSIGN_OR_RETURN(auto scoped_buffer, - xla::ScopedShapedBuffer::MakeScoped( - &buffer, client->backend().memory_allocator())); - set_shaped_buffer(std::move(scoped_buffer)); + set_shaped_buffer(xla::ScopedShapedBuffer( + std::move(buffer), client->backend().memory_allocator())); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 5ff2fb08f03548260215c6aeded2c124f8d28f43..922a91897312096e4bb6ee2a1cc153e0039e2c7a 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -43,8 +43,7 @@ class XlaTensor { // which case the returned value is shaped_buffer()->root_buffer(), or a // normal Tensor in which case the returned value is // {tensor.tensor_data().data(), tensor.tensor_data().size}. - static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor( - const Tensor& tensor); + static se::DeviceMemoryBase DeviceMemoryFromTensor(const Tensor& tensor); // Assign the internal ShapedBuffer to new memory for the given dtype and // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it @@ -64,9 +63,9 @@ class XlaTensor { return *shaped_buffer_; } // Mutates the TensorInfo to set the ShapedBuffer. - void set_shaped_buffer( - std::unique_ptr shaped_buffer) { - shaped_buffer_ = std::move(shaped_buffer); + void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) { + shaped_buffer_ = + xla::MakeUnique(std::move(shaped_buffer)); } // Some tensors on the device may have known values on the host. We use these diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index b9e42ca677cd82e2c18309d25ab33954206ebbe4..0c720932568b0c4eafd2127f76c0b77ad2f86904 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -308,6 +308,25 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "eager_test", + size = "small", + srcs = ["eager_test.py"], + disabled_backends = [ + # TODO(b/78199195) Support XLA CPU devices in eager runtime + "cpu", + "cpu_ondemand", + # TODO(b/78468222) Enable GPU backend + "gpu", + ], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "fft_test", size = "medium", @@ -340,7 +359,7 @@ tf_xla_py_test( tf_xla_py_test( name = "ftrl_test", - size = "small", + size = "medium", srcs = ["ftrl_test.py"], deps = [ ":xla_test", @@ -904,3 +923,15 @@ tf_xla_py_test( "//tensorflow/python:platform_test", ], ) + +tf_xla_py_test( + name = "placeholder_test", + size = "small", + srcs = ["placeholder_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py new file mode 100644 index 0000000000000000000000000000000000000000..bdd0185dfe4abe9d9acecc5381ff82c54b8c0705 --- /dev/null +++ b/tensorflow/compiler/tests/eager_test.py @@ -0,0 +1,137 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test cases for eager execution using XLA.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.platform import googletest + + +class EagerTest(XLATestCase): + + def testBasic(self): + with self.test_scope(): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, product) + + def testExecuteListOutputLen0(self): + with self.test_scope(): + empty = constant_op.constant([], dtype=dtypes.int32) + result = array_ops.unstack(empty, 0) + self.assertTrue(isinstance(result, list)) + self.assertEqual(0, len(result)) + + def testExecuteListOutputLen1(self): + with self.test_scope(): + split_dim = constant_op.constant(1) + value = constant_op.constant([[0, 1, 2], [3, 4, 5]]) + result = array_ops.split(value, 1, axis=split_dim) + self.assertTrue(isinstance(result, list)) + self.assertEqual(1, len(result)) + self.assertAllEqual([[0, 1, 2], [3, 4, 5]], result[0]) + + def testExecuteListOutputLen3(self): + with self.test_scope(): + split_dim = constant_op.constant(1) + value = constant_op.constant([[0, 1, 2], [3, 4, 5]]) + result = array_ops.split(value, 3, axis=split_dim) + self.assertTrue(isinstance(result, list)) + self.assertEqual(3, len(result)) + self.assertAllEqual([[0], [3]], result[0]) + self.assertAllEqual([[1], [4]], result[1]) + self.assertAllEqual([[2], [5]], result[2]) + + def testBasicGraph(self): + # Run some ops eagerly + with self.test_scope(): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, product) + + # Run some ops graphly + with context.graph_mode(), self.test_session() as sess: + with self.test_scope(): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, sess.run(product)) + + def testDegenerateSlices(self): + with self.test_scope(): + npt = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3) + t = constant_op.constant(npt) + # degenerate by offering a forward interval with a negative stride + self.assertAllEqual(npt[0:-1:-1, :, :], t[0:-1:-1, :, :]) + # degenerate with a reverse interval with a positive stride + self.assertAllEqual(npt[-1:0, :, :], t[-1:0, :, :]) + # empty interval in every dimension + self.assertAllEqual(npt[-1:0, 2:2, 2:3:-1], t[-1:0, 2:2, 2:3:-1]) + + def testIdentity(self): + with self.test_scope(): + self.assertAllEqual(2, array_ops.identity(2)) + + def testIdentityOnVariable(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(True) + i = array_ops.identity(v) + self.assertAllEqual(True, i.numpy()) + + def testAssignAddVariable(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(1.0) + v.assign_add(2.0) + self.assertEqual(3.0, v.numpy()) + + def testGradient(self): + def f(x): + return x + + with self.test_scope(): + grad_fn = backprop.gradients_function(f) + self.assertAllEqual(2., grad_fn(1., dy=2.)[0]) + + def testVariableGradient(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(1.0) + + def f(): + x = v0 * v0 + return x + + grads = backprop.implicit_grad(f)() + self.assertEqual(2., grads[0][0].numpy()) + + +if __name__ == "__main__": + ops.enable_eager_execution( + config=config_pb2.ConfigProto(log_device_placement=True)) + googletest.main() diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 12791ef8ac1da948608b1585f423ca217378f031..42e637734c578fcc70473060cb156e172a0a1995 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -34,6 +34,13 @@ from tensorflow.python.ops import image_ops from tensorflow.python.platform import test +def GenerateNumpyRandomRGB(shape): + # Only generate floating points that are fractions like n / 256, since they + # are RGB pixels. Some low-precision floating point types in this test can't + # handle arbitrary precision floating points well. + return np.random.randint(0, 256, shape) / 256. + + class RGBToHSVTest(XLATestCase): def testBatch(self): @@ -43,7 +50,7 @@ class RGBToHSVTest(XLATestCase): shape = (batch_size, 2, 7, 3) for nptype in self.float_types: - inp = np.random.rand(*shape).astype(nptype) + inp = GenerateNumpyRandomRGB(shape).astype(nptype) # Convert to HSV and back, as a batch and individually with self.test_session() as sess: @@ -83,7 +90,7 @@ class RGBToHSVTest(XLATestCase): def testRGBToHSVNumpy(self): """Tests the RGB to HSV conversion matches a reference implementation.""" for nptype in self.float_types: - rgb_flat = np.random.random(64 * 3).reshape((64, 3)).astype(nptype) + rgb_flat = GenerateNumpyRandomRGB((64, 3)).astype(nptype) rgb_np = rgb_flat.reshape(4, 4, 4, 3) hsv_np = np.array([ colorsys.rgb_to_hsv( diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5e6d1313bd0336eba71fcf3658d949bd3342ae11 --- /dev/null +++ b/tensorflow/compiler/tests/placeholder_test.py @@ -0,0 +1,48 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for xla handling of placeholder_with_default.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import googletest + + +class PlaceholderTest(XLATestCase): + + def test_placeholder_with_default_default(self): + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable(4.0) + ph = array_ops.placeholder_with_default(v, shape=[]) + out = ph * 2 + sess.run(variables.variables_initializer([v])) + self.assertEqual(8.0, sess.run(out)) + + def test_placeholder_with_default_fed(self): + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable(4.0) + ph = array_ops.placeholder_with_default(v, shape=[]) + out = ph * 2 + sess.run(variables.variables_initializer([v])) + self.assertEqual(2.0, sess.run(out, {ph: 1.0})) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index ba5f829936fd82ca0cc53eda34aefbca6d80482b..ef047005b60bd156a677050368ef67ae030d6c3a 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.compiler.tests.xla_test import XLATestCase from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -68,40 +69,41 @@ class TernaryOpsTest(XLATestCase): expected=np.array([1, 3, 5], dtype=np.int32)) def testSelect(self): - self._testTernary( - array_ops.where, - np.array(0, dtype=np.bool), - np.array(2, dtype=np.float32), - np.array(7, dtype=np.float32), - expected=np.array(7, dtype=np.float32)) + for dtype in self.numeric_types: + self._testTernary( + array_ops.where, + np.array(0, dtype=np.bool), + np.array(2, dtype=dtype), + np.array(7, dtype=dtype), + expected=np.array(7, dtype=dtype)) - self._testTernary( - array_ops.where, - np.array(1, dtype=np.bool), - np.array([1, 2, 3, 4], dtype=np.float32), - np.array([5, 6, 7, 8], dtype=np.float32), - expected=np.array([1, 2, 3, 4], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array(1, dtype=np.bool), + np.array([1, 2, 3, 4], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array([1, 2, 3, 4], dtype=dtype)) - self._testTernary( - array_ops.where, - np.array(0, dtype=np.bool), - np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32), - np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32), - expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array(0, dtype=np.bool), + np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype)) - self._testTernary( - array_ops.where, - np.array([0, 1, 1, 0], dtype=np.bool), - np.array([1, 2, 3, 4], dtype=np.float32), - np.array([5, 6, 7, 8], dtype=np.float32), - expected=np.array([5, 2, 3, 8], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array([0, 1, 1, 0], dtype=np.bool), + np.array([1, 2, 3, 4], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array([5, 2, 3, 8], dtype=dtype)) - self._testTernary( - array_ops.where, - np.array([0, 1, 0], dtype=np.bool), - np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32), - np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32), - expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array([0, 1, 0], dtype=np.bool), + np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=dtype)) def testSlice(self): for dtype in self.numeric_types: @@ -119,6 +121,23 @@ class TernaryOpsTest(XLATestCase): np.array([2, 1], dtype=np.int32), expected=np.array([[2], [5]], dtype=dtype)) + def testClipByValue(self): + # TODO(b/78258593): enable integer types here too. + for dtype in self.float_types: + test_cases = [ + (np.array([2, 4, 5], dtype=dtype), dtype(7)), # + (dtype(1), np.array([2, 4, 5], dtype=dtype)), # + (np.array([-2, 7, 7], dtype=dtype), np.array([-2, 9, 8], dtype=dtype)) + ] + x = np.array([-2, 10, 6], dtype=dtype) + for lower, upper in test_cases: + self._testTernary( + gen_math_ops._clip_by_value, + x, + lower, + upper, + expected=np.minimum(np.maximum(x, lower), upper)) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index ba5c3a14849cefcb680b03425232724ff32375a8..942504e6bd4c9ce93c9482251823efcbb46ab1c8 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -412,7 +412,6 @@ cc_library( hdrs = ["functionalize_control_flow.h"], deps = [ ":tf2xla_util", - "//tensorflow/compiler/jit:graph_to_functiondef", "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla/ops:xla_ops", diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 16b9142cbf7d2afe99c22acbc32fb17c09b00081..8d1f2684909e876fe5521ba6a63d745c7d3956e0 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -21,13 +21,13 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" @@ -870,6 +870,9 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() { // Merge the inputs of the switch node with one another. This results in // predicates and control input residing in the same cluster. for (const Edge* e : n->in_edges()) { + // Only consider the data inputs to the Switch node. + if (e->IsControlEdge()) continue; + Node* src = e->src(); UnionFind* src_cluster = find_output_cluster(src); int src_cluster_depth = switch_depth[src_cluster->Get().representative]; diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 579b66969990017688477443115cc4f61c18fe4a..00fd08b1a0750739445a124adc7ccf436a4a9b71 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -21,6 +21,7 @@ tf_kernel_library( "cast_op.cc", "categorical_op.cc", "cholesky_op.cc", + "clip_by_value_op.cc", "concat_op.cc", "const_op.cc", "conv_ops.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fdf75be7b1156540d762e3bc04a51f2478f00f46 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class ClipByValueOp : public XlaOpKernel { + public: + explicit ClipByValueOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape shape = ctx->InputShape(0); + const TensorShape min_shape = ctx->InputShape(1); + const TensorShape max_shape = ctx->InputShape(2); + + xla::ComputationBuilder* builder = ctx->builder(); + auto input = ctx->Input(0); + auto min = ctx->Input(1); + auto max = ctx->Input(2); + + auto shape_error = [&]() -> tensorflow::Status { + return errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "Input shape: ", shape.DebugString(), + " clip_value_min shape: ", min_shape.DebugString(), + " clip_value_max shape: ", max_shape.DebugString()); + }; + + if (shape != min_shape) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error()); + min = builder->Broadcast(min, shape.dim_sizes()); + } + if (shape != max_shape) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error()); + max = builder->Broadcast(max, shape.dim_sizes()); + } + ctx->SetOutput(0, builder->Clamp(min, input, max)); + } +}; + +REGISTER_XLA_OP(Name("ClipByValue"), ClipByValueOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 7945c05af40df21a798a2cff51fe7f8e935793f6..0b79cb0916ee8a7d0e26c5dc12557639336f8ab1 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -29,52 +29,54 @@ namespace tensorflow { Status XlaGather(const xla::ComputationDataHandle& input, const TensorShape& input_shape, const xla::ComputationDataHandle& indices, - TensorShape indices_shape, int64 axis, bool indices_are_nd, - DataType dtype, DataType index_type, + const TensorShape& indices_shape, int64 axis, + bool indices_are_nd, DataType dtype, DataType index_type, xla::ComputationBuilder* builder, xla::ComputationDataHandle* gather_output) { + // There is no deep reason why we need this precondition, but this is the only + // combination that is used and tested today. + CHECK(!indices_are_nd || axis == 0); + + // num_index_dims is the number of components in each index in the indices + // tensor. + // + // num_indices is the total number of (n dimensional or scalar) indices in the + // indices tensor. + // // If the indices are N-dimensional, then the minor dimension of indices // should be of size N and correspond to the N indices. - int64 num_index_dims = 1; + int64 num_index_dims; + int64 num_indices = 1; if (indices_are_nd) { CHECK_GE(indices_shape.dims(), 1); num_index_dims = indices_shape.dim_size(indices_shape.dims() - 1); - indices_shape.RemoveLastDims(1); + for (int64 i = 0, e = indices_shape.dims() - 1; i < e; i++) { + num_indices *= indices_shape.dim_size(i); + } + } else { + num_index_dims = 1; + for (int64 i = 0, e = indices_shape.dims(); i < e; i++) { + num_indices *= indices_shape.dim_size(i); + } } - // Although the indices Tensor is flattened into rank 1 during the lookup, - // and each scalar entry is used as an index into the first dimension of the - // input, the output is returned with shape: - // input.shape[:axis] + indices.shape + input.shape[axis+1:] - - const int64 num_indices = indices_shape.num_elements(); - TensorShape input_shape_pre_axis(input_shape); - input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims()); - TensorShape input_shape_post_axis(input_shape); - input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims); - // Each slice of the input tensor has shape: - // [, 1, ..., 1, ] - TensorShape slice_shape(input_shape); - for (int64 i = 0; i < num_index_dims; ++i) { - slice_shape.set_dim(axis + i, 1); - } + // Degenerate case: empty indices. + if (num_indices == 0) { + TensorShape input_shape_pre_axis{input_shape}; + input_shape_pre_axis.RemoveDimRange(axis, input_shape.dims()); + TensorShape input_shape_post_axis{input_shape}; + input_shape_post_axis.RemoveDimRange(0, axis + num_index_dims); - TensorShape loop_out_shape; - loop_out_shape.AppendShape(input_shape_pre_axis); - loop_out_shape.AddDim(num_indices); - loop_out_shape.AppendShape(input_shape_post_axis); - TensorShape loop_out_slice_shape; - loop_out_slice_shape.AppendShape(input_shape_pre_axis); - loop_out_slice_shape.AddDim(1); - loop_out_slice_shape.AppendShape(input_shape_post_axis); + TensorShape indices_shape_no_index_vectors{indices_shape}; + if (indices_are_nd) { + indices_shape_no_index_vectors.RemoveLastDims(1); + } - TensorShape out_shape; - out_shape.AppendShape(input_shape_pre_axis); - out_shape.AppendShape(indices_shape); - out_shape.AppendShape(input_shape_post_axis); + TensorShape out_shape; + out_shape.AppendShape(input_shape_pre_axis); + out_shape.AppendShape(indices_shape_no_index_vectors); + out_shape.AppendShape(input_shape_post_axis); - // Degenerate case: empty indices. - if (num_indices == 0) { *gather_output = builder->Broadcast(XlaHelpers::Zero(builder, dtype), out_shape.dim_sizes()); return Status::OK(); @@ -88,76 +90,61 @@ Status XlaGather(const xla::ComputationDataHandle& input, } } - // Flatten the major dimensions of indices into a single dimension for ease of - // iteration. If there is an axis dimension, we must leave it alone. - std::vector flat_indices_shape = {num_indices}; - if (indices_are_nd) { - flat_indices_shape.push_back(num_index_dims); - } - - // Specify the shape of the loop-carried Tensor tuple. - - // Construct the initial values of the loop-carried Tensors. - auto flat_indices = builder->Reshape(indices, flat_indices_shape); - auto init_out = builder->Broadcast(XlaHelpers::Zero(builder, dtype), - loop_out_shape.dim_sizes()); - auto init = {input, flat_indices, init_out}; - - // Construct the while loop body's function. The implementation of gather is: - // for i in range(num_indices): - // index = dynamic-slice(indices, i) - // xi = dynamic-slice(input, index) - // output = dynamic-update-slice(output, xi, i) - auto body_fn = [&](xla::ComputationDataHandle i, - gtl::ArraySlice loop_vars, - xla::ComputationBuilder* bodyb) { - auto input = loop_vars[0]; - auto indices = loop_vars[1]; - auto output = loop_vars[2]; - - auto zero_index = XlaHelpers::Zero(bodyb, index_type); - - // Slice the i-th index from the indices array. - xla::ComputationDataHandle index; - auto indices_offset = bodyb->Reshape(i, {1}); - if (indices_are_nd) { - // Slice out the entire nd index, if applicable. - indices_offset = bodyb->Pad(indices_offset, zero_index, - xla::MakeEdgePaddingConfig({{0, 1}})); - index = bodyb->DynamicSlice(indices, indices_offset, {1, num_index_dims}); - index = bodyb->Collapse(index, {0, 1}); + // Example of a 1-D gather with axis=1, pulling two [3,1] tensors out of a + // tensor of shape [3,3]. + // + // operand = s32[3,3] parameter(0) + // indices = s32[2] parameter(1) + // gather = s32[3,2] gather(operand, indices), + // output_window_dims={0}, + // elided_window_dims={1}, + // gather_dims_to_operand_dims={1}, + // index_vector_dim=1, + // window_bounds={3, 1} + // + // + // Example of an N-D gather pulling out slices of shape [1,1,2] out of a + // tensor of shape [3,3,2]. + // + // operand = s32[3,3,2] parameter(0) + // indices = s32[2,2] parameter(1) + // gather = s32[2,2] gather(operand, indices), + // output_window_dims={1}, + // elided_window_dims={0,1}, + // gather_dims_to_operand_dims={0,1}, + // index_vector_dim=0, + // window_bounds={1,1,2} + + xla::GatherDimensionNumbers dim_numbers; + std::vector window_bounds; + window_bounds.reserve(input_shape.dims()); + for (int64 i = 0; i < input_shape.dims(); i++) { + int64 window_bound; + if (axis <= i && i < (axis + num_index_dims)) { + dim_numbers.add_elided_window_dims(i); + window_bound = 1; } else { - index = bodyb->DynamicSlice(indices, indices_offset, {1}); + window_bound = input_shape.dim_size(i); + } + + window_bounds.push_back(window_bound); + + if (i < axis) { + dim_numbers.add_output_window_dims(i); + } else if (i >= (axis + num_index_dims)) { + int64 indices_rank = + indices_are_nd ? (indices_shape.dims() - 1) : indices_shape.dims(); + dim_numbers.add_output_window_dims(i + indices_rank - num_index_dims); } + } + + dim_numbers.set_index_vector_dim(indices_are_nd ? (indices_shape.dims() - 1) + : indices_shape.dims()); + for (int64 i = axis; i < axis + num_index_dims; i++) { + dim_numbers.add_gather_dims_to_operand_dims(i); + } - // Slice the corresponding data from the input array. - auto start_indices = bodyb->Pad( - index, zero_index, - xla::MakeEdgePaddingConfig( - {{input_shape_pre_axis.dims(), input_shape_post_axis.dims()}})); - auto slice_i = bodyb->Reshape( - bodyb->DynamicSlice(input, start_indices, slice_shape.dim_sizes()), - loop_out_slice_shape.dim_sizes()); - - // Construct the index into the output Tensor 0, ..., , 0, ... - std::vector out_index_vals( - loop_out_shape.dims(), bodyb->Reshape(zero_index, {1})); - out_index_vals[input_shape_pre_axis.dims()] = bodyb->Reshape(i, {1}); - auto out_index = bodyb->ConcatInDim(out_index_vals, 0); - - // Update the output Tensor - auto updated_output = bodyb->DynamicUpdateSlice(output, slice_i, out_index); - - return std::vector{input, indices, - updated_output}; - }; - - // Construct the While loop, extract and reshape the output. - xla::PrimitiveType ptype; - TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(index_type, &ptype)); - TF_ASSIGN_OR_RETURN(auto outputs, XlaForEachIndex(num_indices, ptype, body_fn, - init, "gather", builder)); - *gather_output = builder->Reshape(outputs[2], out_shape.dim_sizes()); + *gather_output = builder->Gather(input, indices, dim_numbers, window_bounds); return Status::OK(); } diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h index bd8b92c22d71fe89ab8951ec79f411feef6505e3..f9376f0eabdc0f0c565eb4b9f86425de96b5aa22 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h @@ -36,8 +36,8 @@ namespace tensorflow { Status XlaGather(const xla::ComputationDataHandle& input, const TensorShape& input_shape, const xla::ComputationDataHandle& indices, - TensorShape indices_shape, int64 axis, bool indices_are_nd, - DataType dtype, DataType index_type, + const TensorShape& indices_shape, int64 axis, + bool indices_are_nd, DataType dtype, DataType index_type, xla::ComputationBuilder* builder, xla::ComputationDataHandle* gather_output); diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc index 39af662b638cb9d723118e58fcfc983633fed497..e72200bfbcff20c55ac03030f1afc4bacaabf7ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc @@ -38,6 +38,7 @@ class IdentityOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp); REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp); +REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp); REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp); REGISTER_XLA_OP(Name("StopGradient"), IdentityOp); REGISTER_XLA_OP(Name("Snapshot"), IdentityOp); diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 7f72a6073df218b9e2bd4cc0c0b5bb10b5cd4b84..9bf5821b54abe3994085ad72043ff143077824c5 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -83,15 +83,6 @@ xla::StatusOr TriangularSolve( block_size); } - // Returns [b1, b2, ... , bn, indices[0], indices[1]]. - auto prepend_batch_dims = [&](std::array indices) { - std::vector output(ndims); - std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin()); - std::copy(indices.begin(), indices.end(), - output.begin() + batch_dimensions.size()); - return output; - }; - // Applies a complex conjugation operation if `a` is complex and `conjugate_a` // is true, otherwise returns its argument. auto maybe_conj = [&](xla::ComputationBuilder* builder, @@ -108,11 +99,12 @@ xla::StatusOr TriangularSolve( std::unique_ptr sub = builder->CreateSubBuilder( tensorflow::strings::StrCat("trsm_base_", k)); - auto a_param = - sub->Parameter(0, - xla::ShapeUtil::MakeShape(b_shape->element_type(), - prepend_batch_dims({k, k})), - "a"); + auto a_param = sub->Parameter( + 0, + xla::ShapeUtil::MakeShape( + b_shape->element_type(), + PrependMajorDims(sub.get(), batch_dimensions, {k, k})), + "a"); std::array b_lastd; if (left_side) { @@ -120,11 +112,12 @@ xla::StatusOr TriangularSolve( } else { b_lastd = {m, k}; } - auto b_param = - sub->Parameter(1, - xla::ShapeUtil::MakeShape(b_shape->element_type(), - prepend_batch_dims(b_lastd)), - "b"); + auto b_param = sub->Parameter( + 1, + xla::ShapeUtil::MakeShape( + b_shape->element_type(), + PrependMajorDims(sub.get(), batch_dimensions, b_lastd)), + "b"); // We use a left-looking subroutine on the block diagonal in some common // cases, while falling back to a recursive call in unsupported cases. The @@ -380,14 +373,6 @@ xla::StatusOr TriangularSolveLeftLooking( batch_dimensions.push_back(a_size); } - auto prepend_batch_dims = [&](std::array indices) { - std::vector output(ndims); - std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin()); - std::copy(indices.begin(), indices.end(), - output.begin() + batch_dimensions.size()); - return output; - }; - auto maybe_conj = [&](xla::ComputationBuilder* builder, xla::ComputationDataHandle x) { auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; @@ -479,30 +464,6 @@ xla::StatusOr TriangularSolveLeftLooking( auto body_b = bodyb->GetTupleElement(input_tuple, 3); auto zero = bodyb->ConstantR0(0); - // Set up some helper functions. - auto prepend_zeros = [&](std::array starts) { - auto zero = bodyb->Reshape(bodyb->ConstantR0(0), {1}); - std::vector padded_starts(ndims, zero); - padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1}); - padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1}); - return bodyb->ConcatInDim(padded_starts, 0); - }; - - auto dynamic_slice = [&](xla::ComputationDataHandle x, - std::array starts, - std::array sizes) { - auto padded_starts = prepend_zeros(starts); - auto padded_sizes = prepend_batch_dims(sizes); - return bodyb->DynamicSlice(x, padded_starts, padded_sizes); - }; - - auto update = [&](xla::ComputationDataHandle x, - xla::ComputationDataHandle update, - std::array starts) { - auto padded_starts = prepend_zeros(starts); - return bodyb->DynamicUpdateSlice(x, update, padded_starts); - }; - // We'd like to implement this: // if transpose_a: // a_row = T(a[..., i+1:, i:i+1]) @@ -516,22 +477,29 @@ xla::StatusOr TriangularSolveLeftLooking( // all zeros and use that as zero-padding (doing unnecessary FLOPs). xla::ComputationDataHandle a_row; if (transpose_a) { - a_row = dynamic_slice(body_a, {zero, i}, {m, 1}); + TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, + {zero, i}, {m, 1})); } else { - a_row = dynamic_slice(body_a, {i, zero}, {1, m}); + TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, + {i, zero}, {1, m})); } TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out, /*transpose_x=*/transpose_a, /*transpose_y=*/false, /*conjugate_x=*/conjugate_a, /*conjugate_y=*/false)); - auto result_row = - bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update); + TF_ASSIGN_OR_RETURN( + auto result_row_slice, + DynamicSliceInMinorDims(bodyb.get(), body_b, {i, zero}, {1, n})); + auto result_row = bodyb->Sub(result_row_slice, b_update); // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] - auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1}); + TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a, + {i, i}, {1, 1})); auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt)); - body_out = update(body_out, div_result, {i, zero}); + TF_ASSIGN_OR_RETURN(body_out, + DynamicUpdateSliceInMinorDims(bodyb.get(), body_out, + div_result, {i, zero})); // if transpose_a: // return (i - 1, body_out, a, b) diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 86263d847ae02d50e70dafb0129b2664c522f2a3..c0e996768491a6315c21021ce874b8a11557de6e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -813,4 +813,29 @@ Status XlaCompiler::SetHostToDeviceMetadata( return Status::OK(); } +Status XlaCompiler::GetHostComputeControlDependency( + const string& host_compute_name, xla::ComputationDataHandle* handle) { + const auto iter = host_compute_control_output_.find(host_compute_name); + if (iter == host_compute_control_output_.end()) { + return errors::InvalidArgument( + "No registered control handle for host compute Op '", host_compute_name, + "'"); + } else { + *handle = iter->second; + } + return Status::OK(); +} + +Status XlaCompiler::SetHostComputeControlDependency( + const string& host_compute_name, const xla::ComputationDataHandle& handle) { + if (host_compute_control_output_.find(host_compute_name) != + host_compute_control_output_.end()) { + return errors::InvalidArgument( + "Duplicate control handles registered for for host compute Op ", + host_compute_name); + } + host_compute_control_output_[host_compute_name] = handle; + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index a6747bbe72e161b2ece55697825cce0e71145a5c..8f564f35ec81765e8998513dfd4805d221200c6c 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -325,6 +325,23 @@ class XlaCompiler { gtl::ArraySlice types, gtl::ArraySlice shapes); + // In order to avoid deadlocks from dependencies in host computations, it can + // be necessary to enforce a partial order on the execution of HostCompute + // Ops. In particular it may be necessary to constrain the SendToHost for one + // HostCompute to run before blocking on the RecvAtHost for another + // HostCompute. The compiler maintains a mapping from 'host_compute_name' to + // handle, where the handle is an 'output' of the HostCompute Op corresponding + // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced + // later can add the handle as an 'input' to enforce the constraints. + // 'host_compute_name' can be any string the client wishes to use to identify + // a given HostCompute Op as long as the names are unique within the + // compilation. + Status GetHostComputeControlDependency(const string& host_compute_name, + xla::ComputationDataHandle* handle); + Status SetHostComputeControlDependency( + const string& host_compute_name, + const xla::ComputationDataHandle& handle); + const Options& options() const { return options_; } xla::Client* client() const { return options_.client; } FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; } @@ -391,6 +408,9 @@ class XlaCompiler { std::unordered_map host_compute_sends_; std::unordered_map host_compute_recvs_; + std::unordered_map + host_compute_control_output_; + TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler); }; diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 751777222fcc7ec073958349aa2677d5b4e6757d..1af9cb6d2ab15a33b56f1df0410f47d7e139a1ba 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -443,6 +443,9 @@ cc_library( srcs = ["executable_run_options.cc"], hdrs = ["executable_run_options.h"], visibility = ["//visibility:public"], + deps = [ + ":types", + ], ) cc_library( @@ -602,8 +605,8 @@ cc_library( ":util", ":window_util", ":xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_evaluator", "//tensorflow/compiler/xla/service:shape_inference", diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index a299c2afd45aa6b785964b8a8e1400ddf54083a4..286d06d12ffca7410067f2d33398497576986807 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -130,6 +130,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:compile_only_service", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/core:stream_executor_no_cuda", diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index f0f94298a05f7c4bdc41cbfb8572454fbedd371d..328e1b8fa84e7baaca41c6c9a65e9a1598ac32ae 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -235,6 +235,11 @@ StatusOr Client::LoadSnapshot(const SessionModule& module) { return Computation(stub_, response.computation()); } +StatusOr Client::LoadSnapshot(const HloSnapshot& module) { + TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module()); + return XlaComputation(module.hlo().hlo_module()); +} + StatusOr> Client::Execute( const Computation& computation, tensorflow::gtl::ArraySlice arguments, diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index 14c685d94ea31c382d84223ca4e2eba544420d78..a63ff4c56d1dd78c7abfa2bf163b5fbd54d82b2b 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -255,6 +255,9 @@ class Client { StatusOr LoadSnapshot(const SessionModule& module); + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + StatusOr LoadSnapshot(const HloSnapshot& module); + ServiceInterface* stub() { return stub_; } private: diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc index b1663bc815719c3da75b37593ac665b1f3493db8..803a9e40094391ba47ed27713f4538caf875c4f6 100644 --- a/tensorflow/compiler/xla/client/client_library.cc +++ b/tensorflow/compiler/xla/client/client_library.cc @@ -23,22 +23,19 @@ limitations under the License. namespace xla { -LocalClientOptions::LocalClientOptions(perftools::gputools::Platform* platform, +LocalClientOptions::LocalClientOptions(se::Platform* platform, int number_of_replicas, int intra_op_parallelism_threads) : platform_(platform), number_of_replicas_(number_of_replicas), intra_op_parallelism_threads_(intra_op_parallelism_threads) {} -LocalClientOptions& LocalClientOptions::set_platform( - perftools::gputools::Platform* platform) { +LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) { platform_ = platform; return *this; } -perftools::gputools::Platform* LocalClientOptions::platform() const { - return platform_; -} +se::Platform* LocalClientOptions::platform() const { return platform_; } LocalClientOptions& LocalClientOptions::set_number_of_replicas( int number_of_replicas) { @@ -69,7 +66,7 @@ ClientLibrary::ClientLibrary() = default; ClientLibrary::~ClientLibrary() = default; /* static */ StatusOr ClientLibrary::GetOrCreateLocalClient( - perftools::gputools::Platform* platform) { + se::Platform* platform) { LocalClientOptions default_options; default_options.set_platform(platform); return GetOrCreateLocalClient(default_options); @@ -77,7 +74,7 @@ ClientLibrary::~ClientLibrary() = default; /* static */ StatusOr ClientLibrary::GetOrCreateLocalClient( const LocalClientOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); int replica_count = options.number_of_replicas(); ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); @@ -115,7 +112,7 @@ ClientLibrary::~ClientLibrary() = default; } /* static */ LocalService* ClientLibrary::GetXlaService( - perftools::gputools::Platform* platform) { + se::Platform* platform) { ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); auto it = client_library.local_instances_.find(platform->id()); @@ -124,8 +121,7 @@ ClientLibrary::~ClientLibrary() = default; } /* static */ StatusOr -ClientLibrary::GetOrCreateCompileOnlyClient( - perftools::gputools::Platform* platform) { +ClientLibrary::GetOrCreateCompileOnlyClient(se::Platform* platform) { ClientLibrary& client_library = Singleton(); tensorflow::mutex_lock lock(client_library.service_mutex_); diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index a6f30d82e43587135697e76e8bc7d122edc0f602..3ad558fa532931937fab898f7b855f0a3370eaec 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -43,13 +43,13 @@ namespace xla { // Options to configure the local client when it is created. class LocalClientOptions { public: - LocalClientOptions(perftools::gputools::Platform* platform = nullptr, + LocalClientOptions(se::Platform* platform = nullptr, int number_of_replicas = 1, int intra_op_parallelism_threads = -1); // Set the platform backing the service, or nullptr for the default platform. - LocalClientOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; + LocalClientOptions& set_platform(se::Platform* platform); + se::Platform* platform() const; // Set the number of replicas to use when compiling replicated // programs. @@ -61,7 +61,7 @@ class LocalClientOptions { int intra_op_parallelism_threads() const; private: - perftools::gputools::Platform* platform_; + se::Platform* platform_; int number_of_replicas_; int intra_op_parallelism_threads_; }; @@ -74,7 +74,7 @@ class ClientLibrary { // platform : The platform the underlying XLA service should target. If // null then default platform is used. static StatusOr GetOrCreateLocalClient( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); static StatusOr GetOrCreateLocalClient( const LocalClientOptions& options); @@ -84,14 +84,14 @@ class ClientLibrary { // Returns the service from the service thread. Only used in unit tests to // access user computations from client. - static LocalService* GetXlaService(perftools::gputools::Platform* platform); + static LocalService* GetXlaService(se::Platform* platform); // Singleton constructor-or-accessor for compile-only clients. Arguments: // // platform : The platform the underlying XLA service should target. If // null then default platform is used. static StatusOr GetOrCreateCompileOnlyClient( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); // Clears the local instance and compile only instance caches. The client // pointers returned by the previous GetOrCreateLocalClient() or @@ -120,12 +120,10 @@ class ClientLibrary { }; tensorflow::mutex service_mutex_; // Guards the singleton creation state. - std::unordered_map> + std::unordered_map> local_instances_ GUARDED_BY(service_mutex_); - std::unordered_map> + std::unordered_map> compile_only_instances_ GUARDED_BY(service_mutex_); TF_DISALLOW_COPY_AND_ASSIGN(ClientLibrary); diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc index 59662c95ac15e7c23790c5b5ff5d75a694613aeb..96e38bca01087991943aff40ed1cb3e21f9e6cba 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.cc +++ b/tensorflow/compiler/xla/client/compile_only_client.cc @@ -39,6 +39,24 @@ CompileOnlyClient::CompileAheadOfTime( return compiler_service_->CompileAheadOfTime(service_instances, options); } +StatusOr>> +CompileOnlyClient::CompileAheadOfTime( + const tensorflow::gtl::ArraySlice computations, + const AotCompilationOptions& options) { + std::vector service_instances; + service_instances.reserve(computations.size()); + for (const AotXlaComputationInstance& instance : computations) { + service_instances.emplace_back(); + CompileOnlyService::AotXlaComputationInstance& service_instance = + service_instances.back(); + TF_RET_CHECK(instance.computation != nullptr); + service_instance.computation = instance.computation->proto(); + service_instance.argument_layouts = instance.argument_layouts; + service_instance.result_layout = instance.result_layout; + } + return compiler_service_->CompileAheadOfTime(service_instances, options); +} + int64 CompileOnlyClient::PointerSizeForTriple(tensorflow::StringPiece triple) { llvm::Triple llvm_triple( llvm::Triple::normalize(llvm::StringRef(triple.data(), triple.size()))); diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h index 5900048711384e0240a3cd502260eb388eb40f51..c8725b8517484acdaf093bc3b34adb00f69155b1 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.h +++ b/tensorflow/compiler/xla/client/compile_only_client.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/computation.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/compile_only_service.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/statusor.h" @@ -54,6 +55,27 @@ class CompileOnlyClient : public Client { const tensorflow::gtl::ArraySlice computations, const AotCompilationOptions& options); + // A description of an xla computation to compile using CompileAheadOfTime. + // + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + struct AotXlaComputationInstance { + const XlaComputation* computation; + // Inform the compiler of the expected layout for arguments. + std::vector argument_layouts; + // Specifies the expected result layout. + const Shape* result_layout; + }; + + // Compiles a list of xla computations for ahead-of-time execution. This is + // intended for use in static compilation. The |options| parameter describes + // the target for which the compiler should emit code. + // + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + StatusOr>> + CompileAheadOfTime( + const tensorflow::gtl::ArraySlice computations, + const AotCompilationOptions& options); + // Returns the size of a pointer in bytes for a given triple. static int64 PointerSizeForTriple(tensorflow::StringPiece triple); diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc index 4d3b0ee0d6e9ba82cfa09af0fbff0ae1efa0ac64..83c7cb174402133706fbde6a734a29afd8edfe80 100644 --- a/tensorflow/compiler/xla/client/computation_builder.cc +++ b/tensorflow/compiler/xla/client/computation_builder.cc @@ -1046,6 +1046,11 @@ ComputationDataHandle ComputationBuilder::Neg( return UnaryOp(UNOP_NEGATE, operand); } +ComputationDataHandle ComputationBuilder::Clz( + const ComputationDataHandle& operand) { + return UnaryOp(UNOP_CLZ, operand); +} + ComputationDataHandle ComputationBuilder::Clamp( const ComputationDataHandle& min, const ComputationDataHandle& operand, const ComputationDataHandle& max) { diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h index 019c6f3afb5d57bfe453988ded19120a4483cf36..9431c2c459a564e3cf509d9dae16e71fc27ee2c0 100644 --- a/tensorflow/compiler/xla/client/computation_builder.h +++ b/tensorflow/compiler/xla/client/computation_builder.h @@ -657,6 +657,9 @@ class ComputationBuilder { // Enqueues a negate instruction onto the computation. ComputationDataHandle Neg(const ComputationDataHandle& operand); + // Enqueues a count-leading-zeros instruction onto the computation. + ComputationDataHandle Clz(const ComputationDataHandle& operand); + // Enqueues a transpose instruction onto the computation. ComputationDataHandle Transpose( const ComputationDataHandle& operand, diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 30594243dcf51d2b5312b9dcb2bea7d0cd78524d..1c1270590375ab54e5d7b56344db1b2d40e5b89c 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -24,8 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/source_map_util.h" #include "tensorflow/compiler/xla/status_macros.h" -namespace se = ::perftools::gputools; - using xla::source_map_util::InvalidParameterArgument; namespace xla { @@ -136,7 +134,7 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions( return Status::OK(); } -StatusOr> LocalExecutable::Run( +StatusOr LocalExecutable::Run( const tensorflow::gtl::ArraySlice arguments, ExecutableRunOptions run_options) { TF_RETURN_IF_ERROR( @@ -168,28 +166,23 @@ StatusOr> LocalExecutable::Run( if (executable_->dumping()) { return ExecuteAndDump(&service_options, arguments); } - TF_ASSIGN_OR_RETURN( - std::unique_ptr result, - executable_->ExecuteOnStreamWrapper( - &service_options, run_options.execution_profile(), arguments)); - - return MakeUnique(std::move(*result), - run_options.allocator()); + return executable_->ExecuteOnStreamWrapper( + &service_options, run_options.execution_profile(), arguments); } -StatusOr> LocalExecutable::ExecuteAndDump( +StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments) { executable_->session_module()->set_execution_platform( backend_->platform()->Name()); TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module())); TF_ASSIGN_OR_RETURN( - std::unique_ptr result, + ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(result.get(), executable_->session_module())); + TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module())); TF_RETURN_IF_ERROR(executable_->DumpSessionModule()); - return ScopedShapedBuffer::MakeScoped(result.get(), run_options->allocator()); + return std::move(result); } tensorflow::Status LocalExecutable::RecordArguments( @@ -283,9 +276,9 @@ StatusOr> LocalClient::Compile( updated_options)); } -StatusOr> -LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal, - DeviceMemoryAllocator* allocator) { +StatusOr LocalClient::LiteralToShapedBuffer( + const Literal& literal, int device_ordinal, + DeviceMemoryAllocator* allocator) { if (allocator == nullptr) { allocator = backend().memory_allocator(); } @@ -295,7 +288,7 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal, TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, backend().stream_executor(device_ordinal)); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - executor, literal, *scoped_buffer)); + executor, literal, scoped_buffer)); return std::move(scoped_buffer); } diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 98ee7c62c94be7c618cedd3dc12ecbfc812ee180..f306c520ede0014be52d1b952849c8894b092baf 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -38,7 +38,7 @@ class LocalExecutable { public: // Run the compiled computation with the given arguments and options and // return the result. - StatusOr> Run( + StatusOr Run( const tensorflow::gtl::ArraySlice arguments, ExecutableRunOptions run_options); @@ -73,7 +73,7 @@ class LocalExecutable { // Records the computation in a SessionModule proto with the arguments used to // invoke it, and the result. Enabled by flag: --tla_dump_executions_to. - StatusOr> ExecuteAndDump( + StatusOr ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments); @@ -136,7 +136,7 @@ class LocalClient : public Client { // ScopedShapedBuffer. If non-null the given memory allocator is used for // device memory allocation. If null, the default memory allocator for the // device is used. - StatusOr> LiteralToShapedBuffer( + StatusOr LiteralToShapedBuffer( const Literal& literal, int device_ordinal, DeviceMemoryAllocator* allocator = nullptr); @@ -167,7 +167,7 @@ class LocalClient : public Client { StatusOr ReplicaNumberToDeviceOrdinal(int replica_number); // Returns the platform that the underlying service targets. - perftools::gputools::Platform* platform() const; + se::Platform* platform() const; // Returns the number of devices on the system of the service platform // type. Not all devices may be supported by the service (see diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD index 31fa1241ee474a31575c45cf7652063dfc818fac..0d6e207971ec64515ec5e6da292910920edd101a 100644 --- a/tensorflow/compiler/xla/client/xla_client/BUILD +++ b/tensorflow/compiler/xla/client/xla_client/BUILD @@ -31,9 +31,9 @@ cc_library( hdrs = ["xla_computation.h"], deps = [ "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc index 7ccdc2ded2c099690bc9187936db6491ef4142dd..1899983e442116d3ebf8a3e79b0515653cd624cb 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc @@ -1193,6 +1193,10 @@ XlaOp XlaBuilder::Sign(const XlaOp& operand) { return UnaryOp(HloOpcode::kSign, operand); } +XlaOp XlaBuilder::Clz(const XlaOp& operand) { + return UnaryOp(HloOpcode::kClz, operand); +} + XlaOp XlaBuilder::Cos(const XlaOp& operand) { return UnaryOp(HloOpcode::kCos, operand); } diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h index 1f7c731064dc004adcac56547e4717ff1638a491..4955f1515d66af00ddf72e4c7621292a590e662c 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h @@ -57,11 +57,27 @@ class XlaOp { StatusOr GetShape() const; + const XlaBuilder* builder() const { return builder_; } + + bool operator==(const XlaOp& rhs) const { + return handle_ == rhs.handle_ && builder_ == rhs.builder_; + } + + bool operator!=(const XlaOp& rhs) const { + return handle_ != rhs.handle_ || builder_ != rhs.builder_; + } + + friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) { + out << op.handle(); + return out; + } + private: XlaOp(int64 handle, XlaBuilder* builder) : handle_(handle), builder_(builder) {} int64 handle() const { return handle_; } + friend class XlaBuilder; int64 handle_; @@ -571,6 +587,9 @@ class XlaBuilder { // Enqueues a sign instruction onto the computation. XlaOp Sign(const XlaOp& operand); + // Enqueues a count leading zeros instruction onto the computation. + XlaOp Clz(const XlaOp& operand); + // Enqueues a cosine instruction onto the computation. XlaOp Cos(const XlaOp& operand); diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc index a6752c601026518825c7994f6b6fa20d20f34f24..72e3935696e0c44ae3893fc8f1ceb261fa5e2646 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc @@ -17,7 +17,9 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -26,4 +28,13 @@ StatusOr XlaComputation::GetProgramShape() const { return proto_.program_shape(); } +StatusOr> XlaComputation::Snapshot() const { + if (IsNull()) { + return InvalidArgument("Computation is invalid."); + } + auto session = MakeUnique(); + *session->mutable_hlo()->mutable_hlo_module() = proto_; + return std::move(session); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h index 7ad212aa24cd32d104cc4db7aa164c22c9f5be8f..b70b57e9ffec40188f246f5e884146012c02f4a2 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h @@ -48,6 +48,10 @@ class XlaComputation { const HloModuleProto& proto() const { return proto_; } + // Requests that we snapshot the computation into a serializable protocol + // buffer form. + StatusOr> Snapshot() const; + // Returns true if this object is a null Computation. bool IsNull() const { return unique_id_ == -1; } diff --git a/tensorflow/compiler/xla/device_util.h b/tensorflow/compiler/xla/device_util.h index 23a622b1ad0e2f3b220645f62767271f28df24e9..1a51fdee680721a4a03fa5de79a81746d92af76b 100644 --- a/tensorflow/compiler/xla/device_util.h +++ b/tensorflow/compiler/xla/device_util.h @@ -29,7 +29,7 @@ namespace xla { // Returns a string that represents the device in terms of platform and ordinal; // e.g. the first CUDA device will be "cuda:0" -string DeviceIdentifier(perftools::gputools::StreamExecutor* stream_exec) { +string DeviceIdentifier(se::StreamExecutor* stream_exec) { return tensorflow::strings::StrCat(stream_exec->platform()->Name(), ":", stream_exec->device_ordinal()); } diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 1700c977189a9e4aedf6a6a75923c13678dae667..99b8f0558e6e39649c0e2e6ef345332e2aa55736 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -36,12 +36,12 @@ DeviceMemoryAllocator* ExecutableRunOptions::allocator() const { } ExecutableRunOptions& ExecutableRunOptions::set_stream( - perftools::gputools::Stream* stream) { + stream_executor::Stream* stream) { stream_ = stream; return *this; } -perftools::gputools::Stream* ExecutableRunOptions::stream() const { +stream_executor::Stream* ExecutableRunOptions::stream() const { return stream_; } diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 2c1d9ffff10ed26410898ad258aa6b5b2cd37518..a306ae16ba4aee87e565ec3d119b84d90bc69c6c 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -16,26 +16,27 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_ #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_ -// Intentionally forward declared so that ExecutableRunOptions can be linked +// Pulls in the ::stream_executor -> ::xla::se namespace alias. +#include "tensorflow/compiler/xla/types.h" + +// These classes are forward declared so that ExecutableRunOptions can be linked // into an XLA-compiled binary without having to link all of the pointed-to // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't // need to be linked). -namespace perftools { -namespace gputools { +namespace stream_executor { class Stream; class Platform; -} -} +} // namespace stream_executor namespace tensorflow { namespace thread { class ThreadPool; -} -} +} // namespace thread +} // namespace tensorflow namespace Eigen { struct ThreadPoolDevice; -} +} // namespace Eigen namespace xla { @@ -61,8 +62,8 @@ class ExecutableRunOptions { // If set, this is the stream to run the computation on. The platform of the // stream must match the platform the executable was built for. A value of // nullptr indicates the option has not been set. - ExecutableRunOptions& set_stream(perftools::gputools::Stream* stream); - perftools::gputools::Stream* stream() const; + ExecutableRunOptions& set_stream(stream_executor::Stream* stream); + stream_executor::Stream* stream() const; // Sets the thread pool on which to run parallel CPU backend // computations. Does not take ownership. @@ -91,7 +92,7 @@ class ExecutableRunOptions { DeviceMemoryAllocator* allocator_ = nullptr; int device_ordinal_ = -1; DeviceAssignment* device_assignment_ = nullptr; - perftools::gputools::Stream* stream_ = nullptr; + stream_executor::Stream* stream_ = nullptr; tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr; const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr; ExecutionProfile* execution_profile_ = nullptr; diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index 70ae95bf47398589e3c20f72c1f2084a738f253a..bc8405703b02dc1b0c4c87005ea3c15372552157 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -43,7 +43,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) { #ifdef INTEL_MKL flags->set_xla_cpu_use_mkl_dnn(true); #endif // INTEL_MKL - flags->set_xla_gpu_max_kernel_unroll_factor(1); + flags->set_xla_gpu_max_kernel_unroll_factor(4); // Set cudnn batchnorm off by default; it does not provide a performance win // on average. flags->set_xla_gpu_use_cudnn_batchnorm(false); diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index c315b4ff30059147ee33dcdd5b0858a1c39e5999..bb6dd4f9098aefc1c2bbb1b1c41b3cee856b67de 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -44,8 +44,16 @@ namespace { constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; -// Converts between little and big endian, assuming elements in the array are 16 -// bits long. +// Converts between little and big endian. +// +// Precondition: size % 2 == 0 (elements in the array are 16 bits long) +void ConvertEndianShort(string* bytes) { + CHECK_EQ(bytes->size() / 2, 0); + for (int64 i = 0; i < bytes->size(); i += 2) { + std::swap((*bytes)[i], (*bytes)[i + 1]); + } +} + void ConvertEndianShort(char* bytes, int64 size) { CHECK_EQ(size / 2, 0); for (int64 i = 0; i < size; i += 2) { @@ -1930,16 +1938,14 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const { *proto->mutable_f16s() = string( reinterpret_cast(data().data()), size_bytes()); if (!kLittleEndian) { - ConvertEndianShort(const_cast(proto->mutable_f16s()->data()), - proto->f16s().size()); + ConvertEndianShort(proto->mutable_f16s()); } break; case BF16: *proto->mutable_bf16s() = string( reinterpret_cast(data().data()), size_bytes()); if (!kLittleEndian) { - ConvertEndianShort(const_cast(proto->mutable_bf16s()->data()), - proto->bf16s().size()); + ConvertEndianShort(proto->mutable_bf16s()); } break; case F32: diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h index c58c19db2cacbe9b038160f27b9bd76aa58146eb..bfcdfc62f9541ab09b94a48d5121e16bad4d43cd 100644 --- a/tensorflow/compiler/xla/ptr_util.h +++ b/tensorflow/compiler/xla/ptr_util.h @@ -28,26 +28,8 @@ limitations under the License. #include "tensorflow/core/util/ptr_util.h" namespace xla { - -template -std::unique_ptr WrapUnique(T* ptr) { - return tensorflow::WrapUnique(ptr); -} - -template -typename tensorflow::helper::MakeUniqueResult::scalar MakeUnique( - Args&&... args) { - return tensorflow::MakeUnique(std::forward(args)...); -} - -// Overload for array of unknown bound. -// The allocation of arrays needs to use the array form of new, -// and cannot take element constructor arguments. -template -typename tensorflow::helper::MakeUniqueResult::array MakeUnique(size_t n) { - return tensorflow::MakeUnique(n); -} - +using tensorflow::MakeUnique; +using tensorflow::WrapUnique; } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_ diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 0517a5502e686def4ffea59f929aef225186a8aa..ecb87bd8893276fbb9ecffaa0f8a3233d2e0043f 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -20,6 +20,7 @@ py_test( srcs = ["xla_client_test.py"], main = "xla_client_test.py", srcs_version = "PY2AND3", + tags = ["no_oss"], deps = [ ":xla_client", "//tensorflow/python:platform_test", diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 2bacc6a9142971f6d14b3929fb1a69e2a40052e2..24e17abbe06197d2a090421508132e611da3cfa0 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -89,17 +89,16 @@ StatusOr> TransferFromOutfeedLocalReplica( return client->TransferFromOutfeedLocal(shape, device_ordinal); } -LocalShapedBuffer::LocalShapedBuffer( - std::unique_ptr shaped_buffer) +LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer) : shaped_buffer_(std::move(shaped_buffer)) {} -const std::unique_ptr& LocalShapedBuffer::shaped_buffer() - const { - return shaped_buffer_; +const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const { + return &shaped_buffer_; } -static StatusOr> ToBuffer( - LocalClient* client, int device_ordinal, const Literal& arg) { +static StatusOr ToBuffer(LocalClient* client, + int device_ordinal, + const Literal& arg) { return client->LiteralToShapedBuffer(arg, device_ordinal, client->backend().memory_allocator()); } @@ -109,14 +108,15 @@ LocalShapedBuffer* LocalShapedBuffer::FromLiteral( const Literal& argument, const tensorflow::gtl::optional& shape_with_layout) { LocalClient* client = GetOrCreateLocalClient(); - std::unique_ptr buf; - if (shape_with_layout) { - std::unique_ptr relaid = - argument.Relayout(shape_with_layout.value()); - buf = ToBuffer(client, /*device_ordinal=*/0, *relaid).ConsumeValueOrDie(); - } else { - buf = ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); - } + ScopedShapedBuffer buf = [&] { + if (shape_with_layout) { + std::unique_ptr relaid = + argument.Relayout(shape_with_layout.value()); + return ToBuffer(client, /*device_ordinal=*/0, *relaid) + .ConsumeValueOrDie(); + } + return ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); + }(); return new LocalShapedBuffer(std::move(buf)); } @@ -158,14 +158,14 @@ StatusOr> CompiledLocalComputation::Execute( << device_ordinal; // Transfer arguments in - std::vector> scoped_buffers; + std::vector scoped_buffers; scoped_buffers.reserve(arguments.size()); for (int i = 0; i < arguments.size(); ++i) { const Literal& argument = arguments[i]; const tensorflow::gtl::optional& shape_with_layout = shapes_with_layout[i]; - StatusOr> pushed; + StatusOr pushed; if (shape_with_layout) { std::unique_ptr relaid = argument.Relayout(shape_with_layout.value()); @@ -185,7 +185,7 @@ StatusOr> CompiledLocalComputation::Execute( std::vector argument_buffers; argument_buffers.reserve(scoped_buffers.size()); for (auto& buffer : scoped_buffers) { - argument_buffers.push_back(buffer.get()); + argument_buffers.push_back(&buffer); } DeviceAssignment device_assignment = @@ -202,7 +202,7 @@ StatusOr> CompiledLocalComputation::Execute( options.set_intra_op_thread_pool( client->backend().eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); - StatusOr> result_buffer_status = + StatusOr result_buffer_status = executable_->Run(argument_buffers, options); if (!result_buffer_status.ok()) { results[replica] = result_buffer_status.status(); @@ -210,8 +210,8 @@ StatusOr> CompiledLocalComputation::Execute( } // Transfer result out - results[replica] = - client->ShapedBufferToLiteral(*result_buffer_status.ValueOrDie()); + results[replica] = client->ShapedBufferToLiteral( + std::move(result_buffer_status).ValueOrDie()); }); } } @@ -236,7 +236,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( std::vector argument_buffers; argument_buffers.reserve(argument_handles.size()); for (auto& handle : argument_handles) { - argument_buffers.push_back(handle->shaped_buffer().get()); + argument_buffers.push_back(handle->shaped_buffer()); } // Execute @@ -245,7 +245,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool()); options.set_intra_op_thread_pool( client->backend().eigen_intra_op_thread_pool_device()); - std::unique_ptr result_buffer = + ScopedShapedBuffer result_buffer = executable_->Run(argument_buffers, options).ConsumeValueOrDie(); return new LocalShapedBuffer(std::move(result_buffer)); diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index 31046e60f11af9cc89ddec4c5fd16babfc8eb231..e1048909ab29c2147a37ed72844391400d99e90d 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -62,12 +62,12 @@ class LocalShapedBuffer { static LocalShapedBuffer* FromLiteral( const Literal& argument, const tensorflow::gtl::optional& shape_with_layout); - LocalShapedBuffer(std::unique_ptr shaped_buffer); - const std::unique_ptr& shaped_buffer() const; + LocalShapedBuffer(ScopedShapedBuffer shaped_buffer); + const ScopedShapedBuffer* shaped_buffer() const; std::unique_ptr ToLiteral() const; private: - std::unique_ptr shaped_buffer_; + ScopedShapedBuffer shaped_buffer_; }; // Wraps a LocalExecutable produced by compiling a diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index eec48479c929ab0823fef342fc284bfdc4b1f339..dc6f5fe5fcc067c99ced01812f9f2388a00766d0 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -181,16 +181,6 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { PyObjectCppRepr(o).c_str()); }; - auto get_attr = [o, &error](const string& field) -> StatusOr { - PyObject* result = - PyObject_GetAttrString(o, const_cast(field.c_str())); - if (result == nullptr) { - return error(tensorflow::strings::StrCat( - "Failed to get attribute of Shape object:", field)); - } - return result; - }; - auto call_method = [o, &error](const string& method) -> StatusOr { PyObject* result = PyObject_CallMethod(o, const_cast(method.c_str()), nullptr); @@ -202,12 +192,16 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { }; PyObject* np_type; - TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype")); + TF_ASSIGN_OR_RETURN(np_type, call_method("numpy_dtype")); if (np_type->ob_type != &PyArrayDescr_Type) { - return error("Shape attribute np_dtype is not an integer numpy dtype"); + return error( + "Return value of shape method numpy_dtype " + "is not an integer numpy dtype"); } if (!NumpyTypeIsValid(NumpyTypenum(np_type))) { - return error("Shape attribute np_dtype is not a valid integer numpy dtype"); + return error( + "Return value of shape method numpy_dtype " + "is not a valid integer numpy dtype"); } const PrimitiveType element_type = NumpyTypeToPrimitiveType(NumpyTypenum(np_type)); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 9c81f6439d0d9f0a0f0d1d3402e9c1ada46e8691..f6809b6b871d7e246dd43811c7e8c08378d53989 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -166,14 +166,14 @@ class LocalBuffer(object): self._delete = c_api.DeleteLocalShapedBuffer @staticmethod - def from_py(npval, layout_fn=None): - npval = require_numpy_array_layout(npval) + def from_pyval(pyval, layout_fn=None): + pyval = require_numpy_array_layout(pyval) if layout_fn: - shape = Shape.from_numpy(npval) + shape = Shape.from_pyval(pyval) shape = shape.map_leaves(layout_fn) else: shape = None - return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape)) + return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape)) def to_py(self): return self.c_local_shaped_buffer.ToLiteral() @@ -191,53 +191,104 @@ class LocalBuffer(object): class Shape(object): - """XLA shape. + """Represents an XLA shape. - Represents an XLA shape by a corresponding Python/Numpy type and a - list of dimensions, which are themselves Shapes in case this one - represents an XLA tuple. + A shape is either an array shape, having rank-many integer + dimensions and an element type (represented by a Numpy dtype), or it + is a tuple shape, having a shape for every tuple component: + + type shape = + TupleShape of shape list + | ArrayShape of { dimensions: int list; element_type: dtype } + + Callers are expected to instantiate this class only via the static + constructors: tuple_shape, array_shape, and from_pyval. """ - def __init__(self, np_dtype, dimensions, minor_to_major=None): + @staticmethod + def tuple_shape(tuple_shapes): + """Construct a tuple shape.""" + if (not isinstance(tuple_shapes, (tuple, list)) or + not all(isinstance(t, Shape) for t in tuple_shapes)): + raise TypeError('tuple_shapes must be a tuple of Shapes') + return Shape(tuple_shapes, tuple) + + @staticmethod + def array_shape(element_type, dimensions, minor_to_major=None): + """Construct an array shape.""" + if (not isinstance(dimensions, tuple) or + not all(isinstance(i, int) for i in dimensions)): + dimensions = tuple(int(i) for i in dimensions) + return Shape(dimensions, np.dtype(element_type), + minor_to_major=minor_to_major) + + @staticmethod + def from_pyval(pyval): + def convert(pyval): + if isinstance(pyval, tuple): + return Shape.tuple_shape(tuple(convert(elt) for elt in pyval)) + else: + pyval = require_numpy_array_layout(pyval) + return Shape.array_shape(pyval.dtype, np.shape(pyval)) + return convert(pyval) + + def __init__(self, dimensions, dtype, minor_to_major=None): assert isinstance(dimensions, tuple) - self.np_dtype = np_dtype self._dimensions = dimensions + self._dtype = dtype + self._is_tuple = dtype == tuple self._minor_to_major = minor_to_major self._check_minor_to_major() def __eq__(self, other): # pylint: disable=protected-access - return (self.np_dtype == other.np_dtype and + return (self._dtype == other._dtype and self._dimensions == other._dimensions and self._minor_to_major == other._minor_to_major) def __repr__(self): - return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, ' - 'minor_to_major={!r})').format(self.np_dtype, self._dimensions, - self._minor_to_major) - - def element_type(self): - return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)] + return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, ' + '_is_tuple={!r}), _minor_to_major={!r}').format( + self._dtype, self._dimensions, self._is_tuple, + self._minor_to_major) def is_tuple(self): - return self.element_type() == xla_data_pb2.TUPLE + return self._is_tuple - def dimensions(self): - if self.is_tuple(): - raise ValueError('Tuple shape has no dimensions') - return self._dimensions - - def minor_to_major(self): - return self._minor_to_major + def is_array(self): + return not self._is_tuple def tuple_shapes(self): if not self.is_tuple(): - raise ValueError('Shape is not a tuple shape') + raise ValueError('not a tuple shape') + return self._dimensions + + def numpy_dtype(self): + """Like element_type(), but returns dtype('O') in case of a tuple shape.""" + if self.is_tuple(): + return np.dtype(np.object) + else: + return self.element_type() + + def xla_element_type(self): + return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())] + + def element_type(self): + if not self.is_array(): + raise ValueError('not an array shape') + return self._dtype + + def dimensions(self): + if not self.is_array(): + raise ValueError('not an array shape') return self._dimensions def rank(self): return len(self.dimensions()) + def minor_to_major(self): + return self._minor_to_major + def map_leaves(self, f): """Map f over each leaf-level array subshape. @@ -250,7 +301,7 @@ class Shape(object): """ if self.is_tuple(): children = tuple(child.map_leaves(f) for child in self.tuple_shapes()) - return Shape(np.dtype('O'), children) + return Shape.tuple_shape(children) else: mapped = f(self) return self if mapped is None else mapped @@ -264,30 +315,24 @@ class Shape(object): assert sorted(mtm) == range(len(mtm)), self def update_minor_to_major(self, minor_to_major): + if not self.is_array(): + raise ValueError('not an array shape') if not isinstance(minor_to_major, tuple): raise TypeError('minor_to_major must be a tuple') - updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major) + updated = Shape.array_shape( + self.element_type(), self.dimensions(), minor_to_major) updated._check_minor_to_major() # pylint: disable=protected-access return updated - @staticmethod - def from_numpy(npval): - - def convert(npval): - if isinstance(npval, tuple): - return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval)) - else: - return Shape(npval.dtype, np.shape(npval)) - - return convert(require_numpy_array_layout(npval)) - def _wrap_shape(shape_info): dtype, dims = shape_info element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)] if element_type == xla_data_pb2.TUPLE: - dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims) - return Shape(dtype, dims) + shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims) + return Shape.tuple_shape(shapes) + else: + return Shape.array_shape(dtype, dims) def _wrap_data_handle(handle): @@ -420,7 +465,7 @@ class LocalComputation(object): compile_options=None, layout_fn=None): return self.Compile( - argument_shapes=[Shape.from_numpy(arg) for arg in arguments], + argument_shapes=[Shape.from_pyval(arg) for arg in arguments], compile_options=compile_options, layout_fn=layout_fn) @@ -428,7 +473,7 @@ class LocalComputation(object): """Execute with Python values as arguments and return value.""" if not self.is_compiled: raise ValueError('Cannot execute an uncompiled local XLA computation.') - argument_shapes = [Shape.from_numpy(arg) for arg in arguments] + argument_shapes = [Shape.from_pyval(arg) for arg in arguments] if layout_fn: argument_shapes = [ shape.map_leaves(layout_fn) for shape in argument_shapes @@ -607,7 +652,7 @@ class ComputationBuilder(object): A ComputationDataHandle message. """ return self.ParameterWithShape( - Shape.from_numpy(value), name=name, parameter_num=parameter_num) + Shape.from_pyval(value), name=name, parameter_num=parameter_num) def Broadcast(self, operand, sizes): """Enqueues a broadcast operation onto the computation. @@ -968,7 +1013,7 @@ class ComputationBuilder(object): Returns: a ComputationDataHandle to the generated array of F32 values. """ - shape = Shape(self.GetShape(mu).np_dtype, dims) + shape = Shape.array_shape(self.GetShape(mu).element_type(), dims) return _wrap_data_handle( self._client.RngNormal( _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape)) @@ -988,7 +1033,7 @@ class ComputationBuilder(object): Returns: a ComputationDataHandle to the generated array of values with the same numeric type (F32, S32, or U32) as the arguments a and b. """ - shape = Shape(self.GetShape(a).np_dtype, dims) + shape = Shape.array_shape(self.GetShape(a).element_type(), dims) return _wrap_data_handle( self._client.RngUniform( _unwrap_data_handle(a), _unwrap_data_handle(b), shape)) diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 433ea568776102539d4cfe7364eedafe22caf6ac..c073c02040e4d260cf760ea2b25f70d60ddd41a1 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -319,7 +319,7 @@ class LocalBufferTest(LocalComputationTest): def _Execute(self, c, arguments): compiled_c = c.Build().CompileWithExampleArguments(arguments) - arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments] + arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments] result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers) return result_buffer.to_py() @@ -350,7 +350,7 @@ class LocalBufferTest(LocalComputationTest): c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14)) arg = NumpyArrayF32(1.11) compiled_c = c.Build().CompileWithExampleArguments([arg]) - arg_buffer = xla_client.LocalBuffer.from_py(arg) + arg_buffer = xla_client.LocalBuffer.from_pyval(arg) arg_buffer.delete() with self.assertRaises(ValueError): compiled_c.ExecuteWithLocalBuffers([arg_buffer]) @@ -1287,7 +1287,7 @@ class EmbeddedComputationsTest(LocalComputationTest): def testInfeedS32Values(self): to_infeed = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - c.Infeed(xla_client.Shape.from_numpy(to_infeed[0])) + c.Infeed(xla_client.Shape.from_pyval(to_infeed[0])) compiled_c = c.Build().CompileWithExampleArguments() for item in to_infeed: xla_client.transfer_to_infeed(item) @@ -1299,7 +1299,7 @@ class EmbeddedComputationsTest(LocalComputationTest): def testInfeedThenOutfeedS32(self): to_round_trip = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0])) + x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0])) c.Outfeed(x) compiled_c = c.Build().CompileWithExampleArguments() @@ -1309,7 +1309,7 @@ class EmbeddedComputationsTest(LocalComputationTest): execution.start() xla_client.transfer_to_infeed(want) got = xla_client.transfer_from_outfeed( - xla_client.Shape.from_numpy(to_round_trip[0])) + xla_client.Shape.from_pyval(to_round_trip[0])) execution.join() self.assertEqual(want, got) diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index ad3a28e11939d6259ebd75d544a950ba7abd741f..df9dbc58308f047484cccd6919ca0abf328622eb 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -90,7 +90,7 @@ std::unique_ptr> MatmulArray2DImpl( Padding padding) { return ConvArray3DGeneralDimensionsDilated( lhs, rhs, kernel_stride, padding, 1, 1, - ComputationBuilder::CreateDefaultConvDimensionNumbers(1)); + XlaBuilder::CreateDefaultConvDimensionNumbers(1)); } /*static*/ std::unique_ptr> @@ -140,7 +140,7 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated( std::pair kernel_stride, Padding padding) { return ConvArray4DGeneralDimensions( lhs, rhs, kernel_stride, padding, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); } /* static */ std::unique_ptr> diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc index 414829d6e76354672c7c1998d1fb1bd185043d78..0b100bd108e239964483ed5ba279dff61bce0023 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service.cc @@ -20,7 +20,7 @@ limitations under the License. namespace xla { /* static */ StatusOr> GRPCService::NewService( - perftools::gputools::Platform* platform) { + se::Platform* platform) { std::unique_ptr grpc_service(new GRPCService()); TF_ASSIGN_OR_RETURN(grpc_service->service_, ::xla::Service::NewService(platform)); diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h index 7c9e484517e9ced45c40dda78a2bd427a24c2722..fad74375bd59f7254d97c4adbc6b3d2f5fbf6b29 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.h +++ b/tensorflow/compiler/xla/rpc/grpc_service.h @@ -29,7 +29,7 @@ class GRPCService : public grpc::XlaService::Service { // that the service should target. If platform is null then the default // platform is used. static StatusOr> NewService( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); ::grpc::Status Computation(::grpc::ServerContext* context, const ComputationRequest* arg, diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 9831a09c1fd491a5553fd46e9d7e33ca3e5f4891..d55da3686cdfdf0a33ebd6d4e5c1d273192b2f6d 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -359,6 +359,7 @@ cc_library( ":hlo", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", ], ) @@ -755,6 +756,7 @@ cc_library( ":hlo", ":hlo_execution_profile", ":hlo_graph_dumper", + ":hlo_proto", ":pool", ":session_proto", ":shaped_buffer", @@ -1220,6 +1222,23 @@ cc_library( ], ) +tf_cc_test( + name = "hlo_creation_utils_test", + srcs = ["hlo_creation_utils_test.cc"], + deps = [ + ":hlo", + ":hlo_creation_utils", + ":hlo_evaluator", + ":hlo_matchers", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "batchnorm_expander", srcs = ["batchnorm_expander.cc"], @@ -2015,6 +2034,7 @@ cc_library( srcs = ["hlo_verifier.cc"], hdrs = ["hlo_verifier.h"], deps = [ + ":hlo", ":hlo_pass", ":shape_inference", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 8d26938c6e59beaea400ab605d84be5a6ddebf6d..8e785de68cb1fbe4ce9fd58a661bdc208725483b 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -1412,7 +1412,6 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { return Status::OK(); } -// TODO(b/74536353): do this simplification for BroadcastDimOne as well. StatusOr AlgebraicSimplifierVisitor:: TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( HloInstruction* reshape_or_broadcast) { diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 4f819a743c48f30df8dde00ece72a0b4e1748802..cf1231bcce4d004284b71a49063e3e470a9eb93f 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -31,52 +31,68 @@ limitations under the License. namespace xla { StatusOr AllocationTracker::Register( - std::unique_ptr shaped_buffer, const string& tag) { + ScopedShapedBuffer shaped_buffer, const string& tag) { tensorflow::mutex_lock lock(mutex_); VLOG(2) << "Register"; - std::vector> replicated_buffers; + std::vector replicated_buffers; replicated_buffers.emplace_back(std::move(shaped_buffer)); return RegisterInternal(std::move(replicated_buffers), tag); } StatusOr AllocationTracker::RegisterReplicatedBuffers( - std::vector> replicated_buffers, - const string& tag) { + std::vector replicated_buffers, const string& tag) { tensorflow::mutex_lock lock(mutex_); VLOG(2) << "RegisterReplicatedBuffers"; return RegisterInternal(std::move(replicated_buffers), tag); } +// ReleaseIfScopedShapedBuffer lets RegisterInternal(b) call +// b.release() if b is a ScopedShapedBuffer, or otherwise pass b through +// unmodified. +static ShapedBuffer ReleaseIfScopedShapedBuffer(ShapedBuffer b) { return b; } +static ShapedBuffer ReleaseIfScopedShapedBuffer(ScopedShapedBuffer b) { + return b.release(); +} + +template StatusOr AllocationTracker::RegisterInternal( - std::vector> replicated_buffers, - const string& tag) { + std::vector replicated_buffers, const string& tag) { + static_assert(std::is_same::value || + std::is_same::value, + "ShapedBufferTy must be ShapedBuffer or ScopedShapedBuffer."); VLOG(2) << "RegisterInternal(" << "tag: \"" << tag << "\" with " << replicated_buffers.size() << " shaped_buffers."; for (const auto& shaped_buffer : replicated_buffers) { - VLOG(2) << "shaped_buffer:" << *shaped_buffer; - if (shaped_buffer->platform() != backend_->platform()) { + VLOG(2) << "shaped_buffer:" << shaped_buffer; + if (shaped_buffer.platform() != backend_->platform()) { return InvalidArgument( "AllocationTracker for platform %s cannot register buffer from " "platform %s", backend_->platform()->Name().c_str(), - shaped_buffer->platform()->Name().c_str()); + shaped_buffer.platform()->Name().c_str()); } } int64 handle = next_handle_++; for (auto& shaped_buffer : replicated_buffers) { std::vector shape_indices; - ShapeUtil::ForEachSubshape(shaped_buffer->on_device_shape(), - [this, &shape_indices](const Shape& /*subshape*/, - const ShapeIndex& index) { - shape_indices.push_back(index); - }); + ShapeUtil::ForEachSubshape( + shaped_buffer.on_device_shape(), + [&](const Shape& /*subshape*/, const ShapeIndex& index) { + shape_indices.push_back(index); + }); + // Add shaped_buffer's buffers to opaque_to_allocation_map_, which owns + // them. for (const ShapeIndex& index : shape_indices) { - AddAllocationOrIncrementRefCount(shaped_buffer->buffer(index), - shaped_buffer->device_ordinal()); + AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index), + shaped_buffer.device_ordinal()); } - handle_to_shaped_buffers_[handle].emplace_back(std::move(shaped_buffer)); + // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer + // into a regular ShapedBuffer, which is stored in + // handle_to_shaped_buffers_. + handle_to_shaped_buffers_[handle].emplace_back(MakeUnique( + ReleaseIfScopedShapedBuffer(std::move(shaped_buffer)))); } GlobalDataHandle result; @@ -103,10 +119,6 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) { shaped_buffer->device_ordinal())); } } - return Reset(data); -} - -Status AllocationTracker::Reset(const GlobalDataHandle& data) { // Keep a nullptr as a tombstone for unregistered handles. This enables // better error messages. That is, "handle has been deallocated" versus // "handle does not exist". @@ -146,14 +158,14 @@ StatusOr> AllocationTracker::DeconstructTuple( for (int i = 0; i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape()); ++i) { - auto element_buffer = MakeUnique( + auto element_buffer = ShapedBuffer( ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i), ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i), shaped_buffer->platform(), shaped_buffer->device_ordinal()); - element_buffer->set_buffer(shaped_buffer->buffer(/*index=*/{i}), - /*index=*/{}); - std::vector> replicated_buffers; - replicated_buffers.emplace_back(std::move(element_buffer)); + element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}), + /*index=*/{}); + std::vector replicated_buffers; + replicated_buffers.push_back(std::move(element_buffer)); TF_ASSIGN_OR_RETURN( GlobalDataHandle element_handle, RegisterInternal(std::move(replicated_buffers), "deconstructed tuple")); @@ -204,7 +216,7 @@ StatusOr> AllocationTracker::ResolveInternal( } void AllocationTracker::AddAllocationOrIncrementRefCount( - perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) { + se::DeviceMemoryBase device_memory, int device_ordinal) { AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal]; auto it = allocation_map.find(device_memory.opaque()); if (it == allocation_map.end()) { @@ -215,8 +227,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount( } } -Status AllocationTracker::DecrementRefCount( - perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) { +Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory, + int device_ordinal) { AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal]; auto it = allocation_map.find(device_memory.opaque()); TF_RET_CHECK(it != allocation_map.end()); diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 038aee8541b297d6f91fe2b3bce7455fd9a7084e..1174fa641c06ae053bcc652416bfbc30cabc777c 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -45,14 +45,13 @@ class AllocationTracker { // Registers a shaped buffer of device memory, and returns a corresponding // handle that can be used for talking to XLA clients. The given shaped buffer // will be treated as the buffer corresponding to the only replica. - StatusOr Register( - std::unique_ptr shaped_buffer, const string& tag); + StatusOr Register(ScopedShapedBuffer shaped_buffer, + const string& tag); // Registers a vector of shaped buffers of device memory, one per replica, and // returns a corresponding handle that can be used for talking to XLA clients. StatusOr RegisterReplicatedBuffers( - std::vector> replicated_buffers, - const string& tag); + std::vector replicated_buffers, const string& tag); // Unregister the allocation for the given data handle. Status Unregister(const GlobalDataHandle& data); @@ -77,7 +76,7 @@ class AllocationTracker { // Data structure encapsulating single memory allocation on the device. struct Allocation { // The pointer to this allocation. - perftools::gputools::DeviceMemoryBase device_memory; + se::DeviceMemoryBase device_memory; // The device that the memory is allocated on. int device_ordinal; @@ -88,28 +87,28 @@ class AllocationTracker { }; // Internal helper which resolves the given GlobalDataHandle to a - // ShapedBuffer. + // list of ScopedShapedBuffers. StatusOr> ResolveInternal( const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Internal helper which registers a vector of shaped buffers, one per - // replica. + // replica. ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer. If + // it's ShapedBuffer, all of the given buffers must already be tracked by this + // object -- presumably this is a call from DeconstructTuple. + template StatusOr RegisterInternal( - std::vector> replicated_buffers, - const string& tag) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Resets the shaped buffers corresponding to the given handle. - Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); + std::vector replicated_buffers, const string& tag) + EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Adds the given device address to the allocation tracker, or if it already - // exists, then increment it's reference count. - void AddAllocationOrIncrementRefCount( - perftools::gputools::DeviceMemoryBase device_memory, int device_ordinal) + // exists, then increment its reference count. + void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory, + int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Decrements the reference count of the given device memory. Then, if it is // zero, deallocate the memory. - Status DecrementRefCount(perftools::gputools::DeviceMemoryBase device_memory, + Status DecrementRefCount(se::DeviceMemoryBase device_memory, int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // A map from device memory opaque value to allocation. One such map is @@ -132,6 +131,21 @@ class AllocationTracker { // A map from data handle to a vector of shaped buffers that represent the // buffers for different replicas. + // + // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our + // public API returns pointers to them. We expect the concrete class to be + // ShapedBuffer and never ScopedShapedBuffer; deallocation of buffers is + // handled by opaque_to_allocation_map_. + // + // The elements of the vectors need to be unique_ptrs because we return + // pointers to them. (In theory we could use std::list or something instead, + // but we also want to be able to null out these elements.) + // + // The reason that the elements can't be unique_ptrs is + // the existence of DeconstructTuple(). This function allows us to create a + // non-owning "view" into a tuple's sub-buffers. The sub-buffers are then + // free'd when both the view *and* the original tuple are Unregistered. This + // refcounting is managed in opaque_to_allocation_map_. tensorflow::gtl::FlatMap>> handle_to_shaped_buffers_ GUARDED_BY(mutex_); diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 05f2d062784147108a94ffb7bb0ca42ddfe4f010..b1d616ec3506f9e65dac045c8192c4bf061e90f4 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -31,24 +31,20 @@ limitations under the License. #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { -BackendOptions& BackendOptions::set_platform( - perftools::gputools::Platform* platform) { +BackendOptions& BackendOptions::set_platform(se::Platform* platform) { platform_ = platform; return *this; } -perftools::gputools::Platform* BackendOptions::platform() const { - return platform_; -} +se::Platform* BackendOptions::platform() const { return platform_; } BackendOptions& BackendOptions::set_intra_op_parallelism_threads( int num_threads) { @@ -77,7 +73,7 @@ struct Backend::EigenThreadPoolWrapper { /* static */ StatusOr> Backend::CreateBackend( const BackendOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform)); TF_ASSIGN_OR_RETURN(auto stream_executors, PlatformUtil::GetStreamExecutors(platform)); @@ -121,7 +117,7 @@ StatusOr Backend::BorrowStream( } Backend::Backend( - perftools::gputools::Platform* platform, Compiler* compiler, + se::Platform* platform, Compiler* compiler, tensorflow::gtl::ArraySlice stream_executors, TransferManager* transfer_manager, ComputationPlacer* computation_placer, int intra_op_parallelism_threads) @@ -178,7 +174,7 @@ tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const { return intra_op_thread_pool_wrapper_->pool.get(); } -StatusOr Backend::stream_executor( +StatusOr Backend::stream_executor( int device_ordinal) const { if (device_ordinal < 0 || device_ordinal > stream_executors_.back()->device_ordinal()) { @@ -201,9 +197,9 @@ StatusOr Backend::devices_equivalent(int device_ordinal_a, // bit crude but works for GPUs which is the important case where we compile // an executable for one GPU and want to know if it will run (well) on // another. - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_a, + TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_a, stream_executor(device_ordinal_a)); - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * executor_b, + TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor_b, stream_executor(device_ordinal_b)); return (executor_a->GetDeviceDescription().name() == executor_b->GetDeviceDescription().name()); diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index b5ca483b7274d20c31e932d748b6a4c9dea926f9..d32a0a400d8bd51fed6bfb6c6a581e16c8249a5c 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -44,8 +44,8 @@ namespace xla { class BackendOptions { public: // Set the platform backing the backend, or nullptr for the default platform. - BackendOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; + BackendOptions& set_platform(se::Platform* platform); + se::Platform* platform() const; // Sets the thread pool size for parallel execution of an individual operator. // The default value of -1 will result in initializing the thread pool with @@ -54,7 +54,7 @@ class BackendOptions { int intra_op_parallelism_threads() const; private: - perftools::gputools::Platform* platform_ = nullptr; + se::Platform* platform_ = nullptr; int intra_op_parallelism_threads_ = -1; }; @@ -66,7 +66,7 @@ class BackendOptions { // StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie(); class Backend { public: - using StreamPtr = Pool::SmartPtr; + using StreamPtr = Pool::SmartPtr; // Creates a new backend. static StatusOr> CreateBackend( @@ -79,7 +79,7 @@ class Backend { ~Backend(); // Accessors for the various objects. - perftools::gputools::Platform* platform() const { return platform_; } + se::Platform* platform() const { return platform_; } Compiler* compiler() const { return compiler_; } DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_.get(); @@ -96,19 +96,17 @@ class Backend { // Returns stream executors of all supported devices for this backend. The // executors are ordered by the device ordinal. - const std::vector& stream_executors() - const { + const std::vector& stream_executors() const { return stream_executors_; } // Returns the stream executor for the given device ordinal. - StatusOr stream_executor( - int device_ordinal) const; + StatusOr stream_executor(int device_ordinal) const; // Returns the stream executor for the default device ordinal. This stream // executor can only be used when the number of computations is 1 (replication // can be > 1). - perftools::gputools::StreamExecutor* default_stream_executor() const { + se::StreamExecutor* default_stream_executor() const { CHECK(!stream_executors_.empty()); return stream_executors_[0]; } @@ -117,8 +115,7 @@ class Backend { // internal pool, or by constructing/initializating it, and returns the result // to the caller. StatusOr BorrowStream(int device_ordinal); - StatusOr BorrowStream( - perftools::gputools::StreamExecutor* executor); + StatusOr BorrowStream(se::StreamExecutor* executor); // Returns a function to borrow a stream, as `BorrowStream` above does. // Purely for convenience, the caller could rather make this anonymous @@ -157,29 +154,26 @@ class Backend { private: struct EigenThreadPoolWrapper; - Backend(perftools::gputools::Platform* platform, Compiler* compiler, - tensorflow::gtl::ArraySlice - stream_executors, + Backend(se::Platform* platform, Compiler* compiler, + tensorflow::gtl::ArraySlice stream_executors, TransferManager* transfer_manager, ComputationPlacer* computation_placer, int intra_op_parallelism_threads); Backend(const Backend&) = delete; Backend& operator=(const Backend&) = delete; - perftools::gputools::Platform* platform_; + se::Platform* platform_; Compiler* compiler_; TransferManager* transfer_manager_; ComputationPlacer* computation_placer_; // Vector of stream executors. stream_executors_[0] is the default executor. - std::vector stream_executors_; + std::vector stream_executors_; tensorflow::mutex mu_; // Mapping from stream executor to stream pools, used by `BorrowStream` above. - std::map> - stream_pools_ GUARDED_BY(mu_); + std::map> stream_pools_ GUARDED_BY(mu_); // The default memory allocator to use. std::unique_ptr memory_allocator_; diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index c26d2feef584faeff013a602409cdd58c2d44a5a..43ebe92c5ec1c945780f76ca4178a94f948a81b9 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -392,7 +392,6 @@ void BFloat16Propagation::AdjustCalledComputationRoot(HloInstruction* hlo) { adjust_computation(hlo->fused_instructions_computation(), hlo->shape()); break; case HloOpcode::kWhile: - adjust_computation(hlo->while_condition(), hlo->shape()); adjust_computation(hlo->while_body(), hlo->shape()); break; default: diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 88f83014164ff726a11e45e762b9c082cf12720d..183db1652e498edb0b94e9c9a272e2b8a7fc53ba 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -426,8 +426,62 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) { EXPECT_TRUE(OutputsBF16(xpose)); } -// Tests that BF16 is propagated properly through while computations. -TEST_F(BFloat16PropagationTest, PropagateThroughWhile) { +// Tests that BF16 is propagated properly through a while computation with +// non-tuple input/output. +TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) { + auto module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); + + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param0")); + HloInstruction* param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, shape, "param1")); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1)); + + auto builder_cond = HloComputation::Builder("cond"); + auto cond_param = builder_cond.AddInstruction( + HloInstruction::CreateParameter(0, shape, "cond_param")); + auto cond_dot = builder_cond.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kDot, cond_param, cond_param)); + auto cond_root = builder_cond.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt, + builder_cond.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {}), cond_dot, {0, 0}, {1, 1}, {1, 1})), + builder_cond.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {}), cond_dot, {1, 1}, {2, 2}, {1, 1})))); + auto cond = module->AddEmbeddedComputation(builder_cond.Build()); + + auto builder_body = HloComputation::Builder("body"); + auto body_param = builder_body.AddInstruction( + HloInstruction::CreateParameter(0, shape, "body_param")); + auto body_dot = builder_body.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kDot, body_param, body_param)); + auto body = module->AddEmbeddedComputation(builder_body.Build()); + + auto while_hlo = builder.AddInstruction( + HloInstruction::CreateWhile(shape, cond, body, add)); + + auto dot = builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kDot, while_hlo, while_hlo)); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_TRUE(PropagatePrecision(module.get())); + + EXPECT_EQ(computation->root_instruction(), dot); + EXPECT_TRUE( + ShapeUtil::Equal(cond_root->shape(), ShapeUtil::MakeShape(PRED, {}))); + EXPECT_TRUE(OutputsBF16(add)); + EXPECT_TRUE(OutputsBF16(body_dot)); + EXPECT_TRUE(OutputsBF16(body_param)); + EXPECT_TRUE(OutputsBF16(cond_param)); + EXPECT_FALSE(OutputsBF16(dot)); +} + +// Tests that BF16 is propagated properly through while computations with +// tuple-shaped input/output. +TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) { auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {4, 4}); diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index c83da9eddc8f8b156dd9acfc99b393bf844575da..c9f78a0f9f1c0e889cd2c761e3129ec329a7b647 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -37,7 +37,7 @@ limitations under the License. namespace xla { /* static */ StatusOr> -CompileOnlyService::NewService(perftools::gputools::Platform* platform) { +CompileOnlyService::NewService(se::Platform* platform) { ServiceOptions default_options; default_options.set_platform(platform); return NewService(default_options); @@ -45,7 +45,7 @@ CompileOnlyService::NewService(perftools::gputools::Platform* platform) { /* static */ StatusOr> CompileOnlyService::NewService(const ServiceOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } @@ -61,6 +61,33 @@ CompileOnlyService::CompileOnlyService(const ServiceOptions& options, Compiler* compiler) : Service(options, /*execute_backend=*/nullptr), compiler_(compiler) {} +StatusOr>> +CompileOnlyService::CompileAheadOfTime( + const tensorflow::gtl::ArraySlice computations, + const AotCompilationOptions& options) { + std::vector> hlo_modules; + for (const AotXlaComputationInstance& instance : computations) { + TF_RET_CHECK(instance.computation.has_program_shape()); + + const DebugOptions& debug_options = options.debug_options(); + const auto& program_shape = instance.computation.program_shape(); + ExecutionOptions execution_options; + *execution_options.mutable_debug_options() = debug_options; + TF_ASSIGN_OR_RETURN( + std::unique_ptr module_config, + CreateModuleConfig(program_shape, instance.argument_layouts, + &execution_options)); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_module, + HloModule::CreateFromProto(instance.computation, *module_config)); + TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module)); + hlo_modules.push_back(std::move(hlo_module)); + } + + return compiler_->CompileAheadOfTime(std::move(hlo_modules), options); +} + StatusOr>> CompileOnlyService::CompileAheadOfTime( const tensorflow::gtl::ArraySlice computations, diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h index 9859941c6c17460939e5b6817f1c7c415e63443c..c10609e67fcdec459baf25a95173bbf700994be9 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.h +++ b/tensorflow/compiler/xla/service/compile_only_service.h @@ -34,7 +34,7 @@ class CompileOnlyService : public Service { // platform that the service should target. If platform is null then the // default platform is used. static StatusOr> NewService( - perftools::gputools::Platform* platform); + se::Platform* platform); static StatusOr> NewService( const ServiceOptions& options); @@ -53,6 +53,25 @@ class CompileOnlyService : public Service { const tensorflow::gtl::ArraySlice computations, const AotCompilationOptions& Options); + // A description of a xla computation to compile using CompileAheadOfTime. + // + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + struct AotXlaComputationInstance { + HloModuleProto computation; + std::vector argument_layouts; + const Shape* result_layout = nullptr; + }; + + // Compiles a list of xla computations for ahead-of-time execution. This is + // intended for use in static compilation. See + // |CompileOnlyClient::CompileAheadOfTime| for additional details. + // + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + StatusOr>> + CompileAheadOfTime( + const tensorflow::gtl::ArraySlice computations, + const AotCompilationOptions& options); + // Override Service methods that require or imply the existence of an // execute backend. Note that this does not include TransferToClient, as // computing constants produces global data that we may wish to transfer. diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 0392d4af48a040c4a648f7bf9bf21a62ce03a990..8b01a6c4b5004d03e6e7d23b99b923fdcdeaff99 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -23,26 +23,21 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" -namespace se = ::perftools::gputools; - namespace xla { /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_( tensorflow::LINKER_INITIALIZED); -/* static */ std::map* +/* static */ std::map* Compiler::GetPlatformCompilerFactories() { - static auto* r = - new std::map; + static auto* r = new std::map; return r; } /* static */ -std::map>* +std::map>* Compiler::GetPlatformCompilers() { - static auto* r = new std::map>; + static auto* r = new std::map>; return r; } diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index b4b53ae2ed425a48de5bcb6ba5c37b5d37e1f371..5c14591d93cc995a0b75efb14da8ec98d5859ff5 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -70,7 +70,7 @@ class AotCompilationOptions { virtual ~AotCompilationOptions() = default; // Returns the ID of the platform to which these options apply. - virtual perftools::gputools::Platform::Id PlatformId() const = 0; + virtual se::Platform::Id PlatformId() const = 0; // Optional allocator that may be used for allocating temp space on the device // during compilation. @@ -109,7 +109,7 @@ class Compiler { virtual ~Compiler() {} // Returns the ID of the platform that this compiler targets. - virtual perftools::gputools::Platform::Id PlatformId() const = 0; + virtual se::Platform::Id PlatformId() const = 0; // Runs Hlo passes to optimize the given Hlo module, returns the optimized // module. @@ -120,8 +120,7 @@ class Compiler { // algorithm over those buffers, to see which variant is fastest. Any space // allocated should be deallocated before this function returns. virtual StatusOr> RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* executor, + std::unique_ptr module, se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for execution on a device given by the executor, @@ -137,8 +136,7 @@ class Compiler { // // Use the overload below to compile computations that run in parallel. virtual StatusOr> RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* executor, + std::unique_ptr module, se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) = 0; // Compiles a set of HLO modules that can run in parallel, potentially @@ -151,8 +149,7 @@ class Compiler { // modules to RunHloPasses and RunBackends. virtual StatusOr>> Compile( std::vector> modules, - std::vector> - stream_exec, + std::vector> stream_exec, DeviceMemoryAllocator* device_allocator) = 0; // Compiles the HLO module for ahead-of-time execution. This is intended for @@ -171,14 +168,12 @@ class Compiler { // be a singleton, so no ownership is transferred. // // Precondition: a platform kind must not be registered more than once. - static void RegisterCompilerFactory( - perftools::gputools::Platform::Id platform_id, - CompilerFactory compiler_factory); + static void RegisterCompilerFactory(se::Platform::Id platform_id, + CompilerFactory compiler_factory); // Returns the compiler singleton pointer if it is available for the given // platform, or an error status if it is not. - static StatusOr GetForPlatform( - const perftools::gputools::Platform* platform); + static StatusOr GetForPlatform(const se::Platform* platform); // Returns a function that computes the size in bytes of the logical // buffer that contains a shape. @@ -198,12 +193,12 @@ class Compiler { static tensorflow::mutex platform_compiler_mutex_; // Map from platform kind to compiler factory. - static std::map* + static std::map* GetPlatformCompilerFactories(); // Map from platform kind to compiler instance, if we made one already (based // on the factories above). - static std::map>* + static std::map>* GetPlatformCompilers(); }; diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc index 657fba6b6231104bf47f9dec80f7cd36a0ba3efd..7c1bacff92b231661477b9931a3066fd91110445 100644 --- a/tensorflow/compiler/xla/service/computation_placer.cc +++ b/tensorflow/compiler/xla/service/computation_placer.cc @@ -32,8 +32,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const { @@ -132,11 +130,9 @@ StatusOr ComputationPlacer::AssignDevices( ComputationPlacer::platform_computation_placer_mutex_( tensorflow::LINKER_INITIALIZED); -/* static */ std::map* +/* static */ std::map* ComputationPlacer::GetPlatformComputationPlacers() { - static auto* r = - new std::map; + static auto* r = new std::map; return r; } @@ -147,10 +143,10 @@ static std::unique_ptr CreateComputationPlacer() { } static bool InitModule() { - xla::ComputationPlacer::RegisterComputationPlacer(se::host::kHostPlatformId, - &CreateComputationPlacer); - xla::ComputationPlacer::RegisterComputationPlacer(se::cuda::kCudaPlatformId, - &CreateComputationPlacer); + xla::ComputationPlacer::RegisterComputationPlacer( + stream_executor::host::kHostPlatformId, &CreateComputationPlacer); + xla::ComputationPlacer::RegisterComputationPlacer( + stream_executor::cuda::kCudaPlatformId, &CreateComputationPlacer); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h index 737ccabaa7a61931b6e2787f75b02857562d4820..737d00e93ecb51a9bd544bbcbe99d93374d108fb 100644 --- a/tensorflow/compiler/xla/service/computation_placer.h +++ b/tensorflow/compiler/xla/service/computation_placer.h @@ -80,13 +80,13 @@ class ComputationPlacer { // Registers a computation placer creation function for a particular platform. static void RegisterComputationPlacer( - perftools::gputools::Platform::Id platform_id, + se::Platform::Id platform_id, ComputationPlacerCreationFunction creation_function); // Returns the computation placer singleton pointer if it is available for the // given platform, or an error status if it is not. static StatusOr GetForPlatform( - const perftools::gputools::Platform* platform); + const se::Platform* platform); private: // The mutex that guards the platform-to-computation placer map. @@ -101,10 +101,9 @@ class ComputationPlacer { }; // Map from platform kind to computation placer singleton. - static std::map* - GetPlatformComputationPlacers(); + static std::map* GetPlatformComputationPlacers(); - perftools::gputools::Platform::Id platform_id_; + se::Platform::Id platform_id_; TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer); }; diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index f35de080853f7ec986565cb2df1050946ac3f244..e560abc87f84566905333181c159edd3ca297563 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -69,7 +69,7 @@ static StatusOr TryRemoveConditional(HloInstruction* conditional) { conditional->shape(), {conditional->mutable_operand(2)}, conditional->false_computation())); } - + conditional->SetupDerivedInstruction(call_op); TF_RETURN_IF_ERROR(computation->ReplaceInstruction(conditional, call_op)); TF_RETURN_IF_ERROR(CallInliner::Inline(call_op).status()); diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 246b80286189286dd29a306dd0bda495df9dad3e..04fda3b2df574502aa882dd6bcff5ff2022c8edf 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -89,12 +89,10 @@ cc_library( ":cpu_instruction_fusion", ":cpu_layout_assignment", ":cpu_options", - ":cpu_parallelization_preparation", ":disassembler", ":dot_op_emitter", ":ir_emission_utils", ":ir_emitter", - ":parallel_cpu_executable", ":parallel_task_assignment", ":simple_orc_jit", "//tensorflow/compiler/xla:literal_util", @@ -232,35 +230,6 @@ cc_library( ], ) -cc_library( - name = "parallel_cpu_executable", - srcs = ["parallel_cpu_executable.cc"], - hdrs = [ - "parallel_cpu_executable.h", - ], - deps = [ - ":cpu_runtime", - ":shape_partition", - ":simple_orc_jit", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:device_memory_allocator", - "//tensorflow/compiler/xla/service:executable", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_execution_profile", - "//tensorflow/compiler/xla/service:logical_buffer", - "//tensorflow/compiler/xla/service:shaped_buffer", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", - "@llvm//:orc_jit", - ], -) - cc_library( name = "ir_emitter", srcs = [ @@ -661,25 +630,6 @@ cc_library( ], ) -cc_library( - name = "cpu_parallelization_preparation", - srcs = ["cpu_parallelization_preparation.cc"], - hdrs = [ - "cpu_parallelization_preparation.h", - ], - deps = [ - ":ir_emission_utils", - ":parallel_task_assignment", - ":shape_partition", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_cost_analysis", - "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:lib", - ], -) - cc_library( name = "ir_emission_utils", srcs = ["ir_emission_utils.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index e43777c5e5e8afcf08e1e334c8847f6b94d0d047..3c0c367df30639dc7da148c180e2697c71223789 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -56,12 +56,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h" #include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" -#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h" #include "tensorflow/compiler/xla/service/cpu/disassembler.h" #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h" #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h" -#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h" #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h" #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" @@ -100,8 +98,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" -namespace se = ::perftools::gputools; - namespace xla { namespace cpu { @@ -310,10 +306,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { module->config().intra_op_parallelism_threads() > 0 ? module->config().intra_op_parallelism_threads() : tensorflow::port::NumSchedulableCPUs(); - if (options::CpuParallelBackendRequested(module->config())) { - pipeline.AddPass(max_parallelism, - ShapeSizeBytesFunction()); - } else if (!is_aot_compile) { + if (!is_aot_compile) { // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module. // Note this is not run for AOT because it would bring in thread pool // and thread synchronization dependencies which would likely increase @@ -331,13 +324,6 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); - if (options::CpuParallelBackendRequested(module->config())) { - // Re-run the outlining, in case any copies were inserted into the entry - // computation. - pipeline.AddPass(max_parallelism, - ShapeSizeBytesFunction()); - pipeline.AddPass(); - } pipeline.AddPass(); return pipeline.Run(module).status(); } @@ -440,8 +426,7 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) { } // namespace StatusOr> CpuCompiler::RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* /*stream_exec*/, + std::unique_ptr module, se::StreamExecutor* /*stream_exec*/, DeviceMemoryAllocator* /*device_allocator*/) { VLOG(2) << "Before optimization:"; XLA_VLOG_LINES(2, module->ToString()); @@ -454,8 +439,7 @@ StatusOr> CpuCompiler::RunHloPasses( } StatusOr> CpuCompiler::RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* /*device_allocator*/) { const string timer_message = "Compiling [" + module->name() + "] for CPU using JIT"; @@ -526,190 +510,80 @@ StatusOr> CpuCompiler::RunBackend( const string xla_dump_optimized_hlo_proto_to = module->config().debug_options().xla_dump_optimized_hlo_proto_to(); - if (options::CpuParallelBackendRequested(module->config())) { - VLOG(1) << "Using parallel cpu backend"; - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - // DependencyHloOrdering is used for the parallel emitter because the order - // of HLO instruction execution is not known ahead of time. - // DependencyHloOrdering is the most conservative partial order and only - // uses data dependencies for determining order. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run( - module.get(), xla::MakeUnique(module.get()), - BufferSizeBytesFunction(), memory_alignment)); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. - XLA_VLOG_LINES(2, assignment->ToString()); - - if (!xla_dump_optimized_hlo_proto_to.empty()) { - HloProto proto = MakeHloProto(*module, *assignment); - TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_optimized_hlo_proto_to, module->name())); - } - - // If we are using the parallel CPU backend, we need to create map from - // HloInstruction to the corresponding generated function name. - std::map parallel_computations; - std::unordered_map> - aligned_constants; - for (auto instruction : entry_computation->MakeInstructionPostOrder()) { - // Parameters and constants don't get their own computation. - if (instruction->opcode() == HloOpcode::kParameter) { - continue; - } - if (instruction->opcode() == HloOpcode::kConstant) { - // Copy the constant out of the ProtocolBuffer so that we can give it a - // higher alignment. - const void* data = instruction->literal().untyped_data(); - int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape()); - auto iter = aligned_constants.emplace( - instruction, xla::MakeUnique(size)); - CHECK_EQ(iter.second, true); - unsigned char* aligned_data = iter.first->second.get(); - memcpy(aligned_data, data, size); - continue; - } - // The parallel preparation should have ensured that the top-level - // computation consists solely of Call instructions. - TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall) - << module->ToString(); - HloComputation* to_apply = instruction->to_apply(); - parallel_computations.emplace(to_apply, instruction); - } - - IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - jit->target_machine(), jit->external_constant_pool()); - - std::unique_ptr> function_names( - new HloInstructionMap()); - for (auto embedded_computation : - entry_computation->MakeEmbeddedComputationsList()) { - if (embedded_computation->IsFusionComputation()) { - continue; - } - auto parallel_computation_iter = - parallel_computations.find(embedded_computation); - // All parallel computations are considered to be an entry computation for - // IR generation purposes. - bool computation_is_parallel = - parallel_computation_iter != parallel_computations.end(); - TF_ASSIGN_OR_RETURN( - llvm::Function * ir_function, - ir_emitter.EmitComputation( - embedded_computation, embedded_computation->name(), - /*is_top_level_computation=*/computation_is_parallel, - /*instruction_order=*/nullptr)); - // If this computation is parallel, remember it in the function name map. - // This way we know what function to execute when we try to run code for - // the Call instruction. - if (computation_is_parallel) { - HloInstruction* call_instruction = parallel_computation_iter->second; - InsertOrDie(function_names.get(), call_instruction, - llvm_ir::AsString(ir_function->getName())); - } - } - - string ir_module_string; - if (embed_ir_in_executable) { - ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); - } - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - - // JIT compile the LLVM IR module to in-memory machine code. - jit->AddModule(std::move(llvm_module)); - cpu_executable.reset(new ParallelCpuExecutable( - std::move(jit), std::move(assignment), std::move(module), - std::move(function_names), std::move(aligned_constants), - std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map))); - - if (embed_ir_in_executable) { - static_cast(*cpu_executable) - .set_ir_module_string(ir_module_string); - } - } else { - VLOG(1) << "Using sequential cpu backend"; - - // Select an order for emitting the HLO instructions for each - // computation. Using this sequence enables tighter buffer liveness analysis - // and reduced memory usage (as compared to using DependencyHloOrdering). - TF_ASSIGN_OR_RETURN( - SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run(module.get(), - xla::MakeUnique( - module.get(), module_sequence), - BufferSizeBytesFunction(), memory_alignment)); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. - XLA_VLOG_LINES(2, assignment->ToString()); - - if (!xla_dump_optimized_hlo_proto_to.empty()) { - HloProto proto = MakeHloProto(*module, *assignment); - TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_optimized_hlo_proto_to, module->name())); - } + // Select an order for emitting the HLO instructions for each + // computation. Using this sequence enables tighter buffer liveness analysis + // and reduced memory usage (as compared to using DependencyHloOrdering). + TF_ASSIGN_OR_RETURN( + SequentialHloOrdering::HloModuleSequence module_sequence, + CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. + TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run( + module.get(), + xla::MakeUnique(module.get(), module_sequence), + BufferSizeBytesFunction(), memory_alignment)); + // BufferAssignment::ToString() includes a header, so no need for us to + // print one ourselves. + XLA_VLOG_LINES(2, assignment->ToString()); + + if (!xla_dump_optimized_hlo_proto_to.empty()) { + HloProto proto = MakeHloProto(*module, *assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( + proto, xla_dump_optimized_hlo_proto_to, module->name())); + } - // Each computation is a single function. Emit all embedded computations - // before the entry computation. The order of computations returned from - // GetEmbeddedComputations guarantees that a called computation occurs - // before a caller computation. + // Each computation is a single function. Emit all embedded computations + // before the entry computation. The order of computations returned from + // GetEmbeddedComputations guarantees that a called computation occurs + // before a caller computation. - IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - jit->target_machine(), jit->external_constant_pool()); + IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + jit->target_machine(), jit->external_constant_pool()); - for (auto embedded_computation : - entry_computation->MakeEmbeddedComputationsList()) { - if (embedded_computation->IsFusionComputation()) { - continue; - } - TF_RETURN_IF_ERROR( - ir_emitter - .EmitComputation(embedded_computation, - embedded_computation->name(), - /*is_top_level_computation=*/false, - &module_sequence.at(embedded_computation)) - .status()); + for (auto embedded_computation : + entry_computation->MakeEmbeddedComputationsList()) { + if (embedded_computation->IsFusionComputation()) { + continue; } - string function_name_prefix = entry_computation->name().empty() - ? "__compute" - : entry_computation->name(); - TF_ASSIGN_OR_RETURN( - llvm::Function * entry_function, - ir_emitter.EmitComputation(entry_computation, function_name_prefix, - /*is_top_level_computation=*/true, - &module_sequence.at(entry_computation))); - - string function_name = llvm_ir::AsString(entry_function->getName()); - string ir_module_string; - if (embed_ir_in_executable) { - ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); - } - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + TF_RETURN_IF_ERROR( + ir_emitter + .EmitComputation(embedded_computation, embedded_computation->name(), + /*is_top_level_computation=*/false, + &module_sequence.at(embedded_computation)) + .status()); + } + string function_name_prefix = entry_computation->name().empty() + ? "__compute" + : entry_computation->name(); + TF_ASSIGN_OR_RETURN( + llvm::Function * entry_function, + ir_emitter.EmitComputation(entry_computation, function_name_prefix, + /*is_top_level_computation=*/true, + &module_sequence.at(entry_computation))); + + string function_name = llvm_ir::AsString(entry_function->getName()); + string ir_module_string; + if (embed_ir_in_executable) { + ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); + } + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module)); + XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module)); - // JIT compile the LLVM IR module to in-memory machine code. - jit->AddModule(std::move(llvm_module)); - cpu_executable.reset(new CpuExecutable( - std::move(jit), std::move(assignment), std::move(module), function_name, - std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map))); + // JIT compile the LLVM IR module to in-memory machine code. + jit->AddModule(std::move(llvm_module)); + cpu_executable.reset(new CpuExecutable( + std::move(jit), std::move(assignment), std::move(module), function_name, + std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map))); - if (embed_ir_in_executable) { - static_cast(*cpu_executable) - .set_ir_module_string(ir_module_string); - } + if (embed_ir_in_executable) { + static_cast(*cpu_executable) + .set_ir_module_string(ir_module_string); } VLOG(1) << "Compilation finished"; @@ -938,9 +812,9 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { } // namespace xla static bool InitModule() { - xla::Compiler::RegisterCompilerFactory(se::host::kHostPlatformId, []() { - return xla::MakeUnique(); - }); + xla::Compiler::RegisterCompilerFactory( + stream_executor::host::kHostPlatformId, + []() { return xla::MakeUnique(); }); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 3498139ab95d21383c6dc008ae5614b7bfe91148..151af38438a980e40c06a1801a936cb620c6c4ba 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -53,7 +53,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions { RelocationModel relocation_model); ~CpuAotCompilationOptions() override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; // The triple used for compilation, similar to clang's -target flag. const string& triple() const { return triple_; } @@ -112,25 +112,23 @@ class CpuCompiler : public LLVMCompiler { // Bring in // StatusOr>> Compile( // std::vector> modules, - // std::vector> + // std::vector> // stream_execs) using LLVMCompiler::Compile; StatusOr> RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> modules, const AotCompilationOptions& options) override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index c053703c3524a47ee1de9681c1b986edbf109430..aabf4d5161e3af9d49876c6133f8ec5ddfbbf6d6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -45,8 +45,6 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/stream_executor/host/host_stream.h" -namespace se = ::perftools::gputools; - namespace xla { namespace cpu { @@ -75,7 +73,7 @@ CpuExecutable::CpuExecutable( Status CpuExecutable::AllocateBuffers( DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { + std::vector* buffers) { CHECK_EQ(buffers->size(), assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); @@ -245,19 +243,18 @@ static Status DeallocateTempBuffers( return Status::OK(); } -StatusOr> CpuExecutable::CreateResultShapedBuffer( +StatusOr CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice - allocated_buffers, + tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result) { se::Stream* stream = run_options->stream(); - auto result_buffer = MakeUnique( + ScopedShapedBuffer result_buffer( /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal()); + run_options->allocator(), stream->parent()->device_ordinal()); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer which is returned to the caller. - TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus( + TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus( [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { const auto& sources = this->GetRootPointsToSet().element(index); // The points to set is unambiguous so the set should be a @@ -284,7 +281,7 @@ StatusOr> CpuExecutable::CreateResultShapedBuffer( return std::move(result_buffer); } -StatusOr> CpuExecutable::ExecuteOnStream( +StatusOr CpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -303,7 +300,7 @@ StatusOr> CpuExecutable::ExecuteOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - std::unique_ptr result_buffer, + ScopedShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); // Free all buffers not in the result. @@ -313,7 +310,7 @@ StatusOr> CpuExecutable::ExecuteOnStream( return std::move(result_buffer); } -StatusOr> CpuExecutable::ExecuteAsyncOnStream( +StatusOr CpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { if (hlo_profiling_enabled()) { @@ -322,7 +319,7 @@ StatusOr> CpuExecutable::ExecuteAsyncOnStream( "supported on CPU."); } - auto* host_stream = dynamic_cast( + auto* host_stream = dynamic_cast( run_options->stream()->implementation()); se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); @@ -333,7 +330,7 @@ StatusOr> CpuExecutable::ExecuteAsyncOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - std::unique_ptr result_buffer, + ScopedShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); LogLiveAddresses(buffers, buffers_in_result); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index d3502b3a03e27c8f90ed74c4d826dfab1c4e8b75..68ad38cba88720a04519fc2473fe6f9decbaaf93 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -55,12 +55,12 @@ class CpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~CpuExecutable() override {} - StatusOr> ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr> ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; @@ -90,29 +90,27 @@ class CpuExecutable : public Executable { // assignment. Each vector element corresponds to a particular Index. If // a vector element already contains a non-null DeviceMemoryBase, then no // buffer is assigned for this element. - Status AllocateBuffers( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers); + Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, + int device_ordinal, + std::vector* buffers); // Calls the generated function performing the computation with the given // arguments using the supplied buffers. Status ExecuteComputeFunction( const ExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - buffers, + tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); - // Create a ShapedBuffer for holding the result of the computation. The + // Creates a ScopedShapedBuffer for holding the result of the computation. The // addresses (DeviceMemoryBases) are set according to buffer assignment. // 'buffers_in_result' should point to a vector of the same size as // 'allocated_buffers'. An element in buffers_in_result is set to true if the // corresponding buffer is live out of the computation (and thus contained in // the returned ShapedBuffer). - StatusOr> CreateResultShapedBuffer( + StatusOr CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice - allocated_buffers, + tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result); // Returns the points-to set of the root instruction of the entry diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index 09f028463af68bbc2841fecdb2ca6c6a42498798..f9c51f243c47b8069500eca3c9c2929b17f04e62 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -19,7 +19,6 @@ limitations under the License. namespace { -const char* const kXlaParallelCpuOption = "xla_cpu_parallel"; const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; @@ -30,12 +29,6 @@ namespace xla { namespace cpu { namespace options { -bool CpuParallelBackendRequested(const HloModuleConfig& config) { - const auto& extra_options_map = - config.debug_options().xla_backend_extra_options(); - return extra_options_map.count(kXlaParallelCpuOption) > 0; -} - bool OptimizeForSizeRequested(const HloModuleConfig& config) { const auto& extra_options_map = config.debug_options().xla_backend_extra_options(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h index 6ba0fd24538b63a3da81083482e6bee3b552dfea..be62ff3cc1af23408ca8a00f1372e7a998f160c6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h @@ -24,7 +24,6 @@ namespace xla { namespace cpu { namespace options { -bool CpuParallelBackendRequested(const HloModuleConfig& config); bool OptimizeForSizeRequested(const HloModuleConfig& config); bool VectorizedReduceDisabled(const HloModuleConfig& config); tensorflow::gtl::optional LlvmIrGemvTilingFactor( diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc deleted file mode 100644 index 662ee609232f5582ce74f4f515637b2623175e94..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h" - -#include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/shape_partition.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" - -namespace xla { -namespace cpu { - -StatusOr ParallelizationPreparation::Run(HloModule* module) { - XLA_VLOG_LINES(2, "ParallelizationPreparation ENTRY"); - XLA_VLOG_LINES(2, module->ToString()); - - bool changed = false; - TF_ASSIGN_OR_RETURN(changed, RunParallelTaskAssignment(module)); - - HloComputation* entry_computation = module->entry_computation(); - std::unordered_set outlined; - std::vector instructions_to_outline; - for (HloInstruction* instruction : - entry_computation->MakeInstructionPostOrder()) { - // If the instruction has been outlined, it no longer exists and we must not - // dereference it. - if (outlined.count(instruction) > 0) { - continue; - } - - // Skip parameters and constants, there is nothing to parallelize. - if (instruction->opcode() == HloOpcode::kParameter || - instruction->opcode() == HloOpcode::kConstant) { - continue; - } - - // Outline 'instruction' in isolation if it was assigned parallel tasks. - if (OutlineParallelizableInstruction(instruction)) { - outlined.insert(instruction); - changed = true; - continue; - } - - instructions_to_outline.clear(); - HloInstruction* outline_candidate = instruction; - instructions_to_outline.push_back(outline_candidate); - - // Outline sole users with the current instruction. - while (CanOutlineWithUser(outline_candidate)) { - HloInstruction* prior_candidate = outline_candidate; - outline_candidate = *outline_candidate->users().begin(); - if (std::any_of(outline_candidate->operands().begin(), - outline_candidate->operands().end(), - [&](const HloInstruction* operand) { - // Do not consider any candidates which have operands - // other than the prior candidate, constants or - // parameters. Otherwise, we'd increase the fan-in which - // would reduce parallelism. - return operand->opcode() != HloOpcode::kParameter && - operand->opcode() != HloOpcode::kConstant && - operand != prior_candidate; - })) { - break; - } - instructions_to_outline.push_back(outline_candidate); - } - - outlined.insert(instructions_to_outline.begin(), - instructions_to_outline.end()); - - // Optimization to avoid replacing a single existing kCall with another - // kCall that just calls the first one. - if (instructions_to_outline.size() == 1 && - instructions_to_outline[0]->opcode() == HloOpcode::kCall) { - continue; - } - - module->OutlineExpressionFromComputation( - instructions_to_outline, - tensorflow::strings::StrCat("pp_", instruction->name()), - entry_computation); - changed = true; - } - - XLA_VLOG_LINES(2, "ParallelizationPreparation EXIT"); - XLA_VLOG_LINES(2, module->ToString()); - return changed; -} - -StatusOr ParallelizationPreparation::RunParallelTaskAssignment( - HloModule* module) { - VLOG(1) << "RunParallelTaskAssignment max_parallelism_: " << max_parallelism_; - bool changed = false; - // Initialize ParallelTaskAssignment. - ParallelTaskAssignment parallel_task_assignment(max_parallelism_, shape_size_, - module); - // Assign parallel tasks to HLOs in entry computation. - HloComputation* computation = module->entry_computation(); - for (auto* instruction : computation->instructions()) { - // Calculate target parallel task count in [1, max_parallelism_]. - const int64 target_parallel_task_count = - parallel_task_assignment.GetTargetParallelTaskCount(instruction); - if (target_parallel_task_count == 1) { - continue; - } - - // Assign feasible dimension partitions (based on actual dimension sizes). - auto dim_partition_counts = ShapePartitionAssigner(instruction->shape()) - .Run(target_parallel_task_count); - const int64 total_partition_count = - ShapePartitionAssigner::GetTotalPartitionCount(dim_partition_counts); - if (total_partition_count <= 1) { - // Feasible partition calculation resulting in no partitioning, so skip. - continue; - } - VLOG(2) << "Assigning parallel task count: " << total_partition_count - << " to instruction: " << instruction->name(); - // Map 'instruction' to assigned dimension partitioning. - instruction->set_outer_dimension_partitions(dim_partition_counts); - } - - return changed; -} - -bool ParallelizationPreparation::OutlineParallelizableInstruction( - HloInstruction* instruction) { - if (instruction->outer_dimension_partitions().empty()) { - return false; - } - // Store dimension partition counts before outlining (which clones - // 'instruction'). - std::vector dim_partition_counts = - instruction->outer_dimension_partitions(); - // Outline 'instruction' in its own sub-computation. - HloModule* module = instruction->parent()->parent(); - auto* call = module->OutlineExpressionFromComputation( - {instruction}, tensorflow::strings::StrCat("pp_", instruction->name()), - module->entry_computation()); - // Map previously assigned 'dim_partition_counts' to cloned root instruction. - VLOG(1) << "Outlining parallelizable" - << " caller: " << call->name() - << " callee: " << call->to_apply()->root_instruction()->name(); - call->to_apply()->root_instruction()->set_outer_dimension_partitions( - dim_partition_counts); - return true; -} - -bool ParallelizationPreparation::CanOutlineWithUser( - HloInstruction* instruction) { - if (instruction->users().size() != 1) { - // Do not outline 'instruction' with multiple users. - return false; - } - if (AssignedParallelTasks(instruction) || - AssignedParallelTasks(*instruction->users().begin())) { - // Do not outline if 'instruction' (or user) were assigned parallel tasks. - return false; - } - return true; -} - -bool ParallelizationPreparation::AssignedParallelTasks( - HloInstruction* instruction) { - return !instruction->outer_dimension_partitions().empty() || - (instruction->opcode() == HloOpcode::kCall && - !instruction->to_apply() - ->root_instruction() - ->outer_dimension_partitions() - .empty()); -} - -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h deleted file mode 100644 index 87be758ef5d0535fdce3a65e54ce225042019cdb..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_ - -#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" - -namespace xla { -namespace cpu { - -// This pass prepares an HLO module for parallel execution by transforming -// subgraphs of the top-level computation into embedded computations which can -// be executed in parallel. -// TODO(b/29630486): Currently, it is limited to turning all instructions (which -// are not constants or parameters) in the entry computation into embedded -// computations. However, it could make sense to coarsen the parallelization to -// improve cache locality. Also, we will need to do something to intelligently -// handle While constructs. -class ParallelizationPreparation : public HloPassInterface { - public: - // 'max_parallelism': the maximum parallel task count per instruction. - // 'shape_size': shape size function used by HloCostAnalysis during parallel - // task assignment. - ParallelizationPreparation( - const int64 max_parallelism, - const HloCostAnalysis::ShapeSizeFunction& shape_size) - : max_parallelism_(max_parallelism), shape_size_(shape_size) {} - ~ParallelizationPreparation() override {} - - tensorflow::StringPiece name() const override { - return "cpu-parallel-prepare"; - } - - // Run parallel preparation on the given computation. Returns whether the - // computation was changed. - StatusOr Run(HloModule* module) override; - - private: - // Assigns parallel task partitions to conformant instructions in 'module'. - // Returns true on success or error status otherwise. - StatusOr RunParallelTaskAssignment(HloModule* module); - - // Outlines 'instruction' from entry computation, if it had - // been assigned parallel tasks in an earlier pass through the computation. - // Returns true if 'instruction' was successfully outlined, false otherwise. - bool OutlineParallelizableInstruction(HloInstruction* instruction); - - // Returns true if 'instruction' can be outlined into the same sub-computation - // with its single user (parallelizable instructions are not outlined with - // each other). Returns false otherwise. - bool CanOutlineWithUser(HloInstruction* instruction); - - // Returns true if 'instruction' (or the root of the sub-computation that - // 'instruction' calls) has had parallel tasks assigned in earlier pass. - // Returns false otherwise. - bool AssignedParallelTasks(HloInstruction* instruction); - - const int64 max_parallelism_; - const HloCostAnalysis::ShapeSizeFunction shape_size_; -}; - -} // namespace cpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_ diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index f5e61aef534da57ce13d3ee9bbeaeaec31f53d2e..9b39e7f5765ae5eb6a25c06eef4d74b1c00e5c91 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -34,8 +34,6 @@ limitations under the License. #include "tensorflow/core/platform/notification.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -241,21 +239,20 @@ Status CpuTransferManager::TransferLiteralFromOutfeed( } StatusOr CpuTransferManager::TransferTupleBuffersFromOutfeed( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data) { return TransferBuffersFromOutfeedInternal(executor, buffer_data, /*is_tuple=*/true); } StatusOr CpuTransferManager::TransferArrayBufferFromOutfeed( - perftools::gputools::StreamExecutor* executor, void* destination, - int64 size_bytes) { + se::StreamExecutor* executor, void* destination, int64 size_bytes) { return TransferBuffersFromOutfeedInternal( executor, {{destination, size_bytes}}, /*is_tuple=*/false); } StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data, bool is_tuple) { std::vector> buffers; @@ -306,8 +303,8 @@ static std::unique_ptr CreateCpuTransferManager() { } static bool InitModule() { - xla::TransferManager::RegisterTransferManager(se::host::kHostPlatformId, - &CreateCpuTransferManager); + xla::TransferManager::RegisterTransferManager( + stream_executor::host::kHostPlatformId, &CreateCpuTransferManager); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h index 6c7524d94716464218ba18ad9950f702d2759f89..3ecb0d236498371f48caf63249f9cd4e8777752b 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h @@ -37,36 +37,35 @@ class CpuTransferManager : public GenericTransferManager { CpuTransferManager(); ~CpuTransferManager() override {} - Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToInfeed(se::StreamExecutor* executor, const Literal& literal) override; - Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor, - int64 size, const void* source) override; - Status TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source) override; + Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) override; private: // Transfers infeed data to device. InfeedBuffer->Done() must be // called to clean up the memory allocated for InfeedBuffer. StatusOr TransferBufferToInfeedInternal( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source); + se::StreamExecutor* executor, int64 size, const void* source); // Helper that transfers a tuple of element buffers from the device's outfeed. StatusOr TransferTupleBuffersFromOutfeed( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data); // Helper that transfers an array buffer from the device's outfeed. - StatusOr TransferArrayBufferFromOutfeed( - perftools::gputools::StreamExecutor* executor, void* destination, - int64 size_bytes); + StatusOr TransferArrayBufferFromOutfeed(se::StreamExecutor* executor, + void* destination, + int64 size_bytes); // On success, returns the shape that was transferred from the outfeed -- if // is_tuple is true, the returned shape will be a tuple of the returned shapes // for the given buffers. StatusOr TransferBuffersFromOutfeedInternal( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice> buffer_data, bool is_tuple); diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 29afd8ea5f9822ea9ae969ae035511a58de4888e..495fecc4aa8b3cf8fcb3ab63d82d8146546854da 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -1070,7 +1070,8 @@ static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, // 1) be matrices with no padding, and // 2) have an allowed element type. PrimitiveType output_primitive_type = output_shape.element_type(); - return (output_primitive_type == F32 || output_primitive_type == F16) && + return (output_primitive_type == F64 || output_primitive_type == F32 || + output_primitive_type == F16) && IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) && IsRank2WithNoPadding(output_shape); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 3405277d449f2d9e558f2d3f83277163655af592..0b08ad8da3cf1734c96dd41e9145aece08dfe779 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -93,8 +93,6 @@ IrEmitter::IrEmitter( computation_to_profile_idx_(std::move(computation_to_profile_idx)), alias_analysis_(hlo_module, assignment, &llvm_module->getContext()), hlo_module_config_(hlo_module.config()), - parallel_cpu_backend_( - options::CpuParallelBackendRequested(hlo_module_config_)), is_top_level_computation_(false), target_machine_features_(target_machine), external_constant_pool_(external_constant_pool) { @@ -2076,7 +2074,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*root, /*operands=*/{lhs, rhs}, - /*supported_types=*/{F16, F32})); + /*supported_types=*/{F16, F32, F64})); llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs)); llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs)); @@ -2163,8 +2161,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(call)); - if (!computation->root_instruction()->outer_dimension_partitions().empty() && - !parallel_cpu_backend_) { + if (!computation->root_instruction()->outer_dimension_partitions().empty()) { // ParallelTaskAssignment assigned partitions, emit call to // ParallelForkJoin. std::vector call_args = GetArrayFunctionCallArguments( @@ -2550,22 +2547,6 @@ Status IrEmitter::FinishVisit(HloInstruction* root) { } }; - // For the parallel cpu backend, we record the total for each embedded - // computation callee with its caller kCall HLO. - if (parallel_cpu_backend_ && is_top_level_computation_) { - auto* computation = root->parent(); - auto* entry_computation = computation->parent()->entry_computation(); - if (computation != entry_computation) { - for (HloInstruction* instruction : entry_computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCall && - instruction->to_apply()->root_instruction() == root) { - record_complete_computation(GetProfileCounterFor(*instruction)); - return Status::OK(); - } - } - } - } - // For the entry computation this increment is cumulative of embedded // computations since it includes cycles spent in computations invoked by // While, Call etc. diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 509440251497cd7337284c39dae05c5f6c28e7c2..0f2f3d1817d6e891211bed843cd05c414771f151 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -532,8 +532,6 @@ class IrEmitter : public DfsHloVisitorWithDefault { const HloModuleConfig& hlo_module_config_; - const bool parallel_cpu_backend_; - bool is_top_level_computation_; TargetMachineFeatures target_machine_features_; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc deleted file mode 100644 index 07a9f0efcb64db4b2ff0c6518d4b48eee9a505e0..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ /dev/null @@ -1,531 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" -#include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" -#include "tensorflow/compiler/xla/service/cpu/shape_partition.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/logical_buffer.h" -#include "tensorflow/compiler/xla/service/shaped_buffer.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mem.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" - -namespace se = ::perftools::gputools; - -namespace xla { -namespace cpu { - -ParallelCpuExecutable::ParallelCpuExecutable( - std::unique_ptr jit, - std::unique_ptr assignment, - std::unique_ptr hlo_module, - std::unique_ptr> function_names, - std::unordered_map> - aligned_constants, - std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map) - : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map)), - jit_(std::move(jit)), - assignment_(std::move(assignment)), - function_names_(std::move(function_names)), - aligned_constants_(std::move(aligned_constants)) {} - -// Type of the computation function we expect in the JIT. -using ComputeFunctionType = void (*)(void*, const void*, const void**, void**, - int64*, int64*); - -// Given a pointer to an output buffer (following the CPU JIT calling -// conventions), mark addresses that are "live". The initial pointer itself is -// trivially live. If the shape of the buffer is a tuple, this analysis looks -// into the tuple's elements and marks them live as well (since tuples keep -// pointers to buffers) and also works recursively. -// address is an in-memory buffer address that contains some runtime XLA object. -// shape is its shape. marked_addresses is the set of live addresses to -// populate. -static void MarkLiveAddressesInOutput( - const void* address, const Shape& shape, - std::unordered_set* marked_addresses) { - marked_addresses->insert(address); - const uintptr_t* address_buffer = static_cast(address); - if (ShapeUtil::IsTuple(shape)) { - for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - const uintptr_t* element_address = address_buffer + i; - const void* element = reinterpret_cast(*element_address); - MarkLiveAddressesInOutput( - element, ShapeUtil::GetTupleElementShape(shape, i), marked_addresses); - } - } -} - -namespace { - -// Executor manages the concurrent execution of 'functions' for instructions -// in 'pending' on 'thread_pool' (storing resulting data in 'results'). -class Executor { - public: - Executor(const HloInstructionMap& functions, - const ServiceExecutableRunOptions* run_options, - std::list* pending, - HloInstructionMap* results, void** temps_array, - int64* profile_counters_array, const BufferAssignment* assignment) - : functions_(functions), - run_options_(run_options), - pending_(pending), - results_(results), - temps_array_(temps_array), - profile_counters_array_(profile_counters_array), - thread_pool_(CHECK_NOTNULL(run_options_->xla_intra_op_thread_pool())), - assignment_(assignment) {} - - // Executes pending list of instructions on thread pool. - // Returns OK status on success, error status otherwise. - Status Run(); - - private: - // Schedules a parallel invocation of compute function for 'instruction' on - // 'thread_pool_', storing result in 'result_buffer'. - // If 'partition_buffers' is non-null, parallel task will be invoked on - // per-dimension partition [start, limit) values stored in - // 'partition_buffers'. - void Schedule(HloInstruction* instruction, int64* partition_buffers, - void* result_buffer); - - // Returns true if 'instruction' has been assigned parallel tasks (returns - // false otherwise). - bool HasParallelTasks(HloInstruction* instruction); - - // Returns in 'partition_buffers' the partition [size, limit) for each - // dimension. - int64* GetPartitionBuffers( - const std::vector>& partition); - - // Returns array of result buffers for all operands in 'instruction'. - const void** GetOperandBuffers(HloInstruction* instruction); - - // Arguments passed into Executor. - const HloInstructionMap& functions_; - const ServiceExecutableRunOptions* run_options_; - std::list* pending_; - HloInstructionMap* results_; - void** temps_array_; - int64* profile_counters_array_; - tensorflow::thread::ThreadPool* thread_pool_; - const BufferAssignment* assignment_; - - // Members used to manage instruction execution. - tensorflow::mutex completion_queue_lock_; - tensorflow::condition_variable completion_queue_cv_; - std::deque completion_queue_; - int64 instructions_in_flight_ = 0; - std::unordered_map tasks_in_flight_; -}; - -Status Executor::Run() { - while (!pending_->empty() || instructions_in_flight_ > 0) { - auto pending_it = pending_->begin(); - while (pending_it != pending_->end()) { - HloInstruction* instruction = *pending_it; - // Skip pending instructions whose operands aren't ready. - if (std::any_of(instruction->operands().begin(), - instruction->operands().end(), - [&](HloInstruction* operand) { - return !ContainsKey(*results_, operand); - })) { - ++pending_it; - continue; - } - - // Get 'result_buffer' reference to result buffer for 'instruction'. - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment_->GetUniqueTopLevelSlice(instruction)); - void* result_buffer = - static_cast(temps_array_[result_slice.index()]) + - result_slice.offset(); - - if (HasParallelTasks(instruction)) { - // 'instruction' has been assigned parallel task partitions. - CHECK_EQ(HloOpcode::kCall, instruction->opcode()); - HloInstruction* root = instruction->to_apply()->root_instruction(); - - // Create ShapePartitionIterator to iterate through all outer dimension - // partitions of 'instruction'. - ShapePartitionIterator partition_iterator( - root->shape(), root->outer_dimension_partitions()); - - const int64 partition_count = - partition_iterator.GetTotalPartitionCount(); - - // Record total parallel task count for 'instruction' before dispatch. - { - tensorflow::mutex_lock l(completion_queue_lock_); - tasks_in_flight_.insert(std::make_pair(instruction, partition_count)); - VLOG(2) << "Schedule PARALLEL" - << " instruction: " << instruction->name() - << " instruction.callee: " - << instruction->to_apply()->root_instruction()->name() - << " partition_count: " << partition_count; - } - - for (int64 i = 0; i < partition_count; ++i) { - // Get partition [start, limit) for each dimension. - auto partition_buffers = - GetPartitionBuffers(partition_iterator.GetPartition(i)); - Schedule(instruction, partition_buffers, result_buffer); - } - - } else { - // Set tasks in-flight to '1' for sequential instruction execution. - { - tensorflow::mutex_lock l(completion_queue_lock_); - tasks_in_flight_.insert(std::make_pair(instruction, 1)); - VLOG(2) << "Schedule SEQUENTIAL" - << " instruction: " << instruction->name() - << " instruction.callee: " - << instruction->to_apply()->root_instruction()->name(); - } - Schedule(instruction, nullptr, result_buffer); - } - - ++instructions_in_flight_; - pending_it = pending_->erase(pending_it); - } - // Wait for a completed HLO instruction to be present in the queue. We will - // pop it out of the queue and make the result available to its users. - HloInstruction* instruction; - do { - tensorflow::mutex_lock l(completion_queue_lock_); - if (completion_queue_.empty()) { - completion_queue_cv_.wait(l); - } - if (!completion_queue_.empty()) { - instruction = completion_queue_.front(); - completion_queue_.pop_front(); - break; - } - } while (true); - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment_->GetUniqueTopLevelSlice(instruction)); - void* result_buffer = - static_cast(temps_array_[result_slice.index()]) + - result_slice.offset(); - InsertOrDie(results_, instruction, result_buffer); - --instructions_in_flight_; - } - return Status::OK(); -} - -void Executor::Schedule(HloInstruction* instruction, int64* partition_buffers, - void* result_buffer) { - // The thread pool entry takes ownership of |operand_buffers|. - auto operand_buffers = GetOperandBuffers(instruction); - - auto function = FindOrDie(functions_, instruction); - const auto* exec_run_options = &run_options_->run_options(); - thread_pool_->Schedule([this, instruction, result_buffer, operand_buffers, - partition_buffers, exec_run_options, function]() { - function(result_buffer, exec_run_options, operand_buffers, temps_array_, - partition_buffers, profile_counters_array_); - - delete[] operand_buffers; - delete[] partition_buffers; - // Push the completed HLO instruction on the queue, the main - // thread will pop it off and potentially launch more work which - // uses the result. - // TODO(b/27458679) Consider alternative task scheduling and synchronization - // schemes. For example, we could avoid the overhead associate with the - // condvar here if the thread just dequed the next instruction to execute - // on completion. - { - tensorflow::mutex_lock l(completion_queue_lock_); - // Decrement in-flight task count for this completion. - if (--FindOrDie(tasks_in_flight_, instruction) == 0) { - completion_queue_.push_back(instruction); - completion_queue_cv_.notify_all(); - tasks_in_flight_.erase(instruction); - } - } - }); -} - -int64* Executor::GetPartitionBuffers( - const std::vector>& partition) { - // Return in 'partition_buffers' partition [size, limit) for each dimension. - auto partition_buffers = new int64[partition.size() * 2]; - for (int i = 0; i < partition.size(); ++i) { - partition_buffers[2 * i + 0] = partition[i].first; - partition_buffers[2 * i + 1] = partition[i].first + partition[i].second; - } - return partition_buffers; -} - -bool Executor::HasParallelTasks(HloInstruction* instruction) { - return instruction->opcode() == HloOpcode::kCall && - !instruction->to_apply() - ->root_instruction() - ->outer_dimension_partitions() - .empty(); -} - -const void** Executor::GetOperandBuffers(HloInstruction* instruction) { - // We cannot use a move-only RAII type like std::unique_ptr because the - // list of operands is allocated on the main thread and transferred to the - // worker via the lambda passed to enqueue_function. In order for the - // lambda to take ownership, we would need to use generalized lambda - // capture which is a feature new to C++14. - // TODO(b/27458679) Avoid dynamic allocations in Executor. - auto operand_buffers = new const void*[instruction->operand_count()]; - std::transform(instruction->operands().begin(), instruction->operands().end(), - operand_buffers, [this](HloInstruction* operand) { - return FindOrDie(*results_, operand); - }); - return operand_buffers; -} - -} // namespace - -Status ParallelCpuExecutable::AllocateBuffers( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { - CHECK_EQ(buffers->size(), assignment_->Allocations().size()); - VLOG(3) << "Allocating " << assignment_->Allocations().size() - << " allocations for module " << module().name(); - for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); - ++i) { - auto& allocation = assignment_->GetAllocation(i); - - VLOG(3) << allocation.ToString(); - - if (allocation.is_entry_computation_parameter()) { - VLOG(3) << "allocation #" << i << " is a parameter"; - continue; - } - - if (allocation.is_thread_local()) { - VLOG(3) << "buffer #" << i << " is thread-local"; - continue; - } - - int64 buffer_size = allocation.size(); - if (!(*buffers)[i].is_null()) { - VLOG(3) << "buffer #" << i - << " is in the preallocated result ShapedBuffer"; - } else { - TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate( - device_ordinal, buffer_size)); - - VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes [" - << (*buffers)[i].opaque() << "]"; - } - - // Since the output buffer and all the temporary buffers were written into - // by the JITed code, msan has no way of knowing their memory was - // initialized. Mark them initialized so that msan doesn't flag loads from - // these buffers. - TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size); - } - - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment_->GetUniqueTopLevelOutputSlice()); - VLOG(3) << "result index: " << result_slice.index(); - - return Status::OK(); -} - -Status ParallelCpuExecutable::ExecuteComputeFunctions( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice buffers, - HloExecutionProfile* hlo_execution_profile) { - // Allocate profiling counters for each hlo instruction that we would like to - // profile. - std::vector* profile_counters = nullptr; - if (hlo_execution_profile) { - profile_counters = hlo_execution_profile->mutable_profile_counters(); - } - - std::vector buffer_pointers; - buffer_pointers.reserve(buffers.size()); - for (auto device_allocation : buffers) { - buffer_pointers.push_back(device_allocation.opaque()); - } - - // Resolve functions for all the HLO instructions ahead of time. - HloInstructionMap functions; - for (auto& entry : *function_names_) { - tensorflow::mutex_lock lock(jit_mutex_); - HloInstruction* instruction = entry.first; - llvm::JITSymbol sym = jit_->FindCompiledSymbol(entry.second); - TF_RET_CHECK(sym); - InsertOrDie( - &functions, instruction, - reinterpret_cast(cantFail(sym.getAddress()))); - } - - // Map containing pointers to result buffers for each instruction. - HloInstructionMap results; - - uint64 start_micros = tensorflow::Env::Default()->NowMicros(); - - std::list pending; - - // Call the function for each HLO instruction in topological order. - const HloComputation& entry_computation = *module().entry_computation(); - for (auto* instruction : entry_computation.MakeInstructionPostOrder()) { - // Parameters and constants have no functions associated with them. Instead - // just copy the existing buffer into the map containing instruction - // results.. - if (instruction->opcode() == HloOpcode::kParameter) { - InsertOrDie( - &results, instruction, - arguments[instruction->parameter_number()]->root_buffer().opaque()); - } else if (instruction->opcode() == HloOpcode::kConstant) { - unsigned char* aligned_data = - FindOrDie(aligned_constants_, instruction).get(); - InsertOrDie(&results, instruction, aligned_data); - } else { - TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall); - pending.push_back(instruction); - } - } - - // TODO(b/27458679) Manage scheduling based on in-flight concurrency limits. - // For example, if we expect a library conv/matmul call to run at max - // concurrency, we should not dispatch runnable instructions until the - // library call is finished (to avoid expensive cache invalidation). - Executor executor( - functions, run_options, &pending, &results, buffer_pointers.data(), - profile_counters ? profile_counters->data() : nullptr, assignment_.get()); - - TF_RETURN_IF_ERROR(executor.Run()); - - uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - - { - tensorflow::mutex_lock lock(mutex_); - double nanoseconds = (end_micros - start_micros) * 1000.0; - execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); - } - - return Status::OK(); -} - -StatusOr> ParallelCpuExecutable::ExecuteOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - HloExecutionProfile* hlo_execution_profile) { - if (GetRootPointsToSet().IsAmbiguous()) { - return Unimplemented("Points-to set of root instruction is ambiguous"); - } - - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - - auto result_buffer = MakeUnique( - /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal()); - - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers, - hlo_execution_profile)); - - // Copy DeviceMemoryBase values which into the respective location in - // ShapedBuffer which is returned to the caller. - std::vector buffers_in_result(assignment_->Allocations().size(), false); - TF_RETURN_IF_ERROR(result_buffer->buffers().ForEachMutableElementWithStatus( - [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { - const auto& sources = this->GetRootPointsToSet().element(index); - - // The points to set is unambiguous so the set should be a singleton. - CHECK_EQ(1, sources.size()); - const LogicalBuffer* buffer_source = sources[0]; - HloInstruction* src = buffer_source->instruction(); - - // The source for this result buffer can be a nested buffer such as a - // tuple element. The source instruction should have a non-parameter - // buffer assigned. - TF_ASSIGN_OR_RETURN( - const BufferAllocation::Slice slice, - this->assignment_->GetUniqueSlice(src, buffer_source->index())); - CHECK(!slice.allocation()->is_entry_computation_parameter()); - - const BufferAllocation::Index buffer_index = slice.index(); - const se::DeviceMemoryBase& buffer = buffers[buffer_index]; - CHECK(!buffer.is_null() || buffer.size() == 0); - *device_memory = buffer; - buffers_in_result[buffer_index] = true; - return Status::OK(); - })); - - // Free all buffers not in the result. - for (size_t i = 0; i < buffers.size(); ++i) { - se::DeviceMemoryBase alloc = buffers[i]; - if (!buffers_in_result[i] && !alloc.is_null()) { - VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" - << alloc.opaque() << "]"; - TF_RETURN_IF_ERROR(memory_allocator->Deallocate( - stream->parent()->device_ordinal(), &alloc)); - } - } - - return std::move(result_buffer); -} - -StatusOr> -ParallelCpuExecutable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) { - // TODO(b/30671675): Implement asynchronous execution mode. - return Unimplemented( - "Asynchronous execution on stream is not yet supported on CPU."); -} - -const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const { - return assignment_->points_to_analysis().GetPointsToSet( - module().entry_computation()->root_instruction()); -} - -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h deleted file mode 100644 index 87c0a3df458eb4b3f217192597e0de1576304367..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/executable.h" -#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/shaped_buffer.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/thread_annotations.h" - -namespace xla { -namespace cpu { - -// CPU-targeting parallel implementation of the XLA Executable interface. -// -// Wraps a JIT-ed object that can be executed "on device". We JIT for the host -// architecture, so JIT-ed code and host code share the same ABI. -class ParallelCpuExecutable : public Executable { - public: - ParallelCpuExecutable( - std::unique_ptr jit, - std::unique_ptr assignment, - std::unique_ptr hlo_module, - std::unique_ptr> function_names, - std::unordered_map> - aligned_constants, - std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map); - ~ParallelCpuExecutable() override {} - - StatusOr> ExecuteOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - HloExecutionProfile* hlo_execution_profile) override; - - StatusOr> ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) override; - - // This should be called after set_ir_module_string. - const string& ir_module_string() const { return ir_module_string_; } - - void set_ir_module_string(const string& ir_module_string) { - ir_module_string_ = ir_module_string; - } - - static int64 ShapeSizeBytes(const Shape& shape) { - // On the cpu, opaques are pointers. - if (ShapeUtil::IsOpaque(shape)) { - return sizeof(void*); - } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); - } - - private: - // Allocate buffers required for execution and assign them to the elements of - // "buffers". "buffers" should be sized to the number of buffers in buffer - // assignment. Each vector element corresponds to a particular Index. If - // a vector element already contains a non-null DeviceMemoryBase, then no - // buffer is assigned for this element. - Status AllocateBuffers( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers); - - // Calls the generated functions in 'function_names_', performing the - // computation with the given arguments using the supplied buffers. - Status ExecuteComputeFunctions( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - buffers, - HloExecutionProfile* hlo_execution_profile); - - // Returns the points-to set of the root instruction of the entry - // computation. Uses points-to analysis from buffer assignment. - const PointsToSet& GetRootPointsToSet() const; - - // The JIT containing compiled modules. - tensorflow::mutex jit_mutex_; - const std::unique_ptr jit_ GUARDED_BY(jit_mutex_); - - // Buffer assignment for the buffers we need to allocate. - const std::unique_ptr assignment_; - - // The LLVM IR, in string format, of the unoptimized module generated for this - // ParallelCpuExecutable. We save a string instead of an llvm::Module* because - // leaving llvm::Module* in a singleton can cause the heap checker to emit - // false positives. - string ir_module_string_; - - // Map containing the JITted function names for each HLO instruction. - const std::unique_ptr> function_names_; - - // Map from HLO Constant instructions to a pointer to their literal data. - // The data stored in the protocol buffer might be insufficiently aligned, - // we create a sufficiently aligned copy and store it in this map. - const std::unordered_map> - aligned_constants_; - - TF_DISALLOW_COPY_AND_ASSIGN(ParallelCpuExecutable); -}; - -} // namespace cpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_ diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index 78e7aa48accdbb51a8477455f5f9c004828c068f..35db4fd2a22cc1615ade77a801cb28c504db09a6 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -24,19 +24,16 @@ limitations under the License. namespace xla { StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( - const perftools::gputools::Platform* platform, - tensorflow::gtl::ArraySlice - stream_executors) + const se::Platform* platform, + tensorflow::gtl::ArraySlice stream_executors) : DeviceMemoryAllocator(platform), stream_executors_(stream_executors.begin(), stream_executors.end()) {} -StatusOr -StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) { - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor, +StatusOr StreamExecutorMemoryAllocator::Allocate( + int device_ordinal, uint64 size, bool retry_on_failure) { + TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); - perftools::gputools::DeviceMemoryBase result = - stream_executor->AllocateArray(size); + se::DeviceMemoryBase result = stream_executor->AllocateArray(size); if (size > 0 && result == nullptr) { return ResourceExhausted( "Failed to allocate request for %s (%lluB) on device ordinal %d", @@ -47,22 +44,22 @@ StreamExecutorMemoryAllocator::Allocate(int device_ordinal, uint64 size, } tensorflow::Status StreamExecutorMemoryAllocator::Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) { + int device_ordinal, se::DeviceMemoryBase* mem) { if (!mem->is_null()) { - TF_ASSIGN_OR_RETURN(perftools::gputools::StreamExecutor * stream_executor, + TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); // We make a local copy of 'mem' so the original is not zeroed out by the // Deallocate() call below. This gives us a better chance of // catching double-free bugs, since Deallocate silently succeeds for null // values. - perftools::gputools::DeviceMemoryBase mem_copy(*mem); + se::DeviceMemoryBase mem_copy(*mem); stream_executor->Deallocate(&mem_copy); } return tensorflow::Status::OK(); } -StatusOr -StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) { +StatusOr StreamExecutorMemoryAllocator::GetStreamExecutor( + int device_ordinal) { if (device_ordinal < 0) { return InvalidArgument("device ordinal value (%d) must be non-negative", device_ordinal); diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index 39dfad84c1c1c1c461c24de555ecd919cea47d83..da45c4d45a1c56fd39b1e3e17ff131de59ceeced 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -33,30 +33,42 @@ class DeviceMemoryAllocator { public: // Parameter platform indicates which platform the allocator allocates memory // on. Must be non-null. - explicit DeviceMemoryAllocator(const perftools::gputools::Platform* platform) + explicit DeviceMemoryAllocator(const se::Platform* platform) : platform_(platform) {} virtual ~DeviceMemoryAllocator() {} // 'retry_on_failure': If false, and the first attempt to allocate the memory - // fails, the allocation should return immediately without retrying. - // An example use case is optional scratch spaces where a failure - // has only performance impact. + // fails, the allocation should return immediately without retrying. An + // example use case is optional scratch spaces where a failure has only + // performance impact. + // // Allocate() should return a null pointer for a size-0 allocation. // Deallocate() must be a no-op for null pointers. - virtual StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure = true) = 0; - virtual tensorflow::Status Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) = 0; + virtual StatusOr Allocate(int device_ordinal, + uint64 size, + bool retry_on_failure) = 0; + + // Two-arg version of Allocate(), which sets retry-on-failure to true. + // + // (We don't simply use a default argument on the virtual Allocate function + // because default args on virtual functions are disallowed by the Google + // style guide.) + StatusOr Allocate(int device_ordinal, uint64 size) { + return Allocate(device_ordinal, size, /*retry_on_failure=*/true); + } + + virtual tensorflow::Status Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) = 0; // Return the platform that the allocator allocates memory on. - const perftools::gputools::Platform* platform() const { return platform_; } + const se::Platform* platform() const { return platform_; } // Can we call Deallocate() as soon as a computation has been scheduled on // a stream, or do we have to wait for the computation to complete first? virtual bool AllowsAsynchronousDeallocation() const = 0; protected: - const perftools::gputools::Platform* platform_; + const se::Platform* platform_; }; // Default memory allocator for a platform which uses @@ -64,25 +76,27 @@ class DeviceMemoryAllocator { class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { public: StreamExecutorMemoryAllocator( - const perftools::gputools::Platform* platform, - tensorflow::gtl::ArraySlice - stream_executors); + const se::Platform* platform, + tensorflow::gtl::ArraySlice stream_executors); - StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure = true) override; - tensorflow::Status Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + + // Pull in two-arg overload that sets retry_on_failure to true. + using DeviceMemoryAllocator::Allocate; + + tensorflow::Status Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) override; bool AllowsAsynchronousDeallocation() const override; private: - StatusOr GetStreamExecutor( - int device_ordinal); + StatusOr GetStreamExecutor(int device_ordinal); // A vector indexed by device ordinal of StreamExecutors for each device of // the allocator's platform type. If an element is nullptr, then the device // with the respective device ordinal is not supported by XLA. - std::vector stream_executors_; + std::vector stream_executors_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 3f7089d6ca1e1a3b9bb42028327ba54ba4b93974..0528b076027603796a445d8b0e9cbcebd1b513a7 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -147,6 +147,9 @@ class DfsHloVisitorBase { virtual Status HandleLog(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } + virtual Status HandleClz(HloInstructionPtr hlo) { + return HandleElementwiseUnary(hlo); + } virtual Status HandleCos(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } @@ -199,7 +202,6 @@ class DfsHloVisitorBase { virtual Status HandleReduce(HloInstructionPtr hlo) = 0; virtual Status HandleBitcast(HloInstructionPtr hlo) = 0; virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0; - virtual Status HandleBroadcastDimOne(HloInstructionPtr hlo) = 0; virtual Status HandleReshape(HloInstructionPtr hlo) = 0; virtual Status HandleTranspose(HloInstructionPtr hlo) = 0; virtual Status HandleParameter(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index e6680ee9b87e1a01782204047c3b2104995c11ed..240faebe62f5cee4f61b3c36b5e8f653cfd6db8e 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -158,9 +158,6 @@ class DfsHloVisitorWithDefaultBase Status HandleBroadcast(HloInstructionPtr broadcast) override { return DefaultAction(broadcast); } - Status HandleBroadcastDimOne(HloInstructionPtr broadcastDimOne) override { - return DefaultAction(broadcastDimOne); - } Status HandlePad(HloInstructionPtr pad) override { return DefaultAction(pad); } diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index b6a0903b0eeaa04d8bc1488378c148b2016c5d48..38b5efa9fb2cdbb8581682003d2fc68cf2020b2b 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -52,6 +52,13 @@ using tensorflow::strings::StrCat; namespace { +int64 GlobalRandomValue() { + static auto* mu = new tensorflow::mutex(); + static std::mt19937_64 rng{42}; + tensorflow::mutex_lock l(*mu); + return rng(); +} + llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits, int64 mantissa_bits, llvm::IRBuilder<>* ir_builder) { @@ -293,6 +300,12 @@ StatusOr ElementalIrEmitter::EmitIntegerUnaryOp( return operand_value; } } + case HloOpcode::kClz: { + auto is_zero_undef = ir_builder_->getFalse(); + return llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::ctlz, {operand_value, is_zero_undef}, + {operand_value->getType()}, ir_builder_); + } case HloOpcode::kSign: { bool is_signed = primitive_util::IsSignedIntegralType(op->shape().element_type()); @@ -1169,7 +1182,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( llvm::Value* increment = ir_builder_->getInt( llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D})); - auto random_value = [hlo]() { + auto random_value_from_hlo = [hlo]() { const HloModule* module = hlo->IsFused() ? hlo->parent()->FusionInstruction()->parent()->parent() : hlo->parent()->parent(); @@ -1191,10 +1204,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( /*Ty=*/ir_builder_->getInt64Ty(), /*isConstant=*/false, /*Linkage=*/llvm::GlobalValue::PrivateLinkage, - /*Initializer=*/ir_builder_->getInt64(random_value()), + /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()), /*Name=*/"state_ptr0"); + + // When the module config seed is 0, the expected result of a prng is a random + // value. Instead of using the random_value_from_hlo, we need a global random + // value as the graph seed. This is because if we use random_value_from_hlo + // here, then for a newly built hlo graph, it always gives the same number. uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed() - : random_value(); + : GlobalRandomValue(); llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable( /*M=*/*module_, /*Ty=*/ir_builder_->getInt64Ty(), @@ -1334,6 +1352,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kAbs: case HloOpcode::kRoundNearestAfz: case HloOpcode::kCeil: + case HloOpcode::kClz: case HloOpcode::kConvert: case HloOpcode::kBitcastConvert: case HloOpcode::kCopy: diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 471d2fd6cebcd7a00dfea4aca08da08af534b05f..021f09d310b718b51932d0492d1b8f5eb562605c 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -29,18 +29,19 @@ using tensorflow::gtl::ArraySlice; namespace xla { -StatusOr>> -Executable::ExecuteOnStreams( +StatusOr> Executable::ExecuteOnStreams( ArraySlice run_options, ArraySlice> arguments) { TF_RET_CHECK(run_options.size() == arguments.size()); - std::vector> return_values(run_options.size()); + std::vector return_values; + return_values.reserve(run_options.size()); if (run_options.size() == 1) { - TF_ASSIGN_OR_RETURN(return_values[0], + TF_ASSIGN_OR_RETURN(auto rv, ExecuteOnStream(&run_options[0], arguments[0], /*hlo_execution_profile=*/nullptr)); + return_values.push_back(std::move(rv)); return std::move(return_values); } @@ -48,8 +49,9 @@ Executable::ExecuteOnStreams( // We cannot BlockHostUntilDone() on the already-launched executions in case // of error, since if the executions communicate, the initially launched // executions may never complete if not all executions are running. - TF_ASSIGN_OR_RETURN(return_values[i], + TF_ASSIGN_OR_RETURN(auto rv, ExecuteAsyncOnStream(&run_options[i], arguments[i])); + return_values.push_back(std::move(rv)); } for (const auto& options : run_options) { TF_RET_CHECK(options.stream() != nullptr); @@ -58,13 +60,13 @@ Executable::ExecuteOnStreams( return std::move(return_values); } -StatusOr> Executable::ExecuteOnStreamWrapper( +StatusOr Executable::ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, ArraySlice arguments) { - perftools::gputools::Stream* stream = run_options->stream(); - std::unique_ptr timer; + se::Stream* stream = run_options->stream(); + std::unique_ptr timer; if (profile != nullptr) { - timer.reset(new perftools::gputools::Timer(stream->parent())); + timer.reset(new se::Timer(stream->parent())); stream->InitTimer(timer.get()).ThenStartTimer(timer.get()); } @@ -78,7 +80,7 @@ StatusOr> Executable::ExecuteOnStreamWrapper( &hlo_profile_index_map()) : nullptr; - StatusOr> return_value = + StatusOr return_value = ExecuteOnStream(run_options, arguments, profile_ptr.get()); TF_RETURN_IF_ERROR(return_value.status()); @@ -161,4 +163,24 @@ Status Executable::DumpSessionModule() { result); } +/* static */ Status Executable::DumpToDirectory( + const string& directory_path, string filename, + const HloSnapshot& hlo_session) { + tensorflow::Env* env = tensorflow::Env::Default(); + if (!env->IsDirectory(directory_path).ok()) { + // NB! CreateDir does not work reliably with multiple XLA threads -- two + // threads can race to observe the absence of the dump directory and + // simultaneously try to create it, causing the "losing" thread to get a + // "directory already exists" error. + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path)); + } + filename = SanitizeFileName(std::move(filename)); + string file_path = tensorflow::io::JoinPath(directory_path, filename); + string result; + TF_RET_CHECK( + tensorflow::SerializeToStringDeterministic(hlo_session, &result)); + return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path, + result); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index a157235f8af6ea64a488510e427bbae502c46ca6..f7af1ca57492972c58d3ce5ddc083088a32a968f 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -62,14 +63,14 @@ class Executable { // enabled. // // Returns a shaped buffer containing the result of the computation. - virtual StatusOr> ExecuteOnStream( + virtual StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) = 0; // Same as ExecuteOnStream(), but this call is non-blocking and returns as // soon as all of the operations are enqueued for launch on the stream. - virtual StatusOr> ExecuteAsyncOnStream( + virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) = 0; @@ -77,7 +78,7 @@ class Executable { // streams. arguments[i] contains the arguments to the execution on // run_options[i]->stream() and the returned value is at index i of the // returned vector. - virtual StatusOr>> ExecuteOnStreams( + virtual StatusOr> ExecuteOnStreams( tensorflow::gtl::ArraySlice run_options, tensorflow::gtl::ArraySlice< @@ -90,14 +91,14 @@ class Executable { // has completed. virtual Status PopulateExecutionProfile( HloExecutionProfile* hlo_execution_profile, - perftools::gputools::StreamExecutor* executor) { + se::StreamExecutor* executor) { return Status::OK(); } // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a // timer for the execution, sets up HLO profiling if enabled, and fills in the // given ExecutionProfile if non-null. - StatusOr> ExecuteOnStreamWrapper( + StatusOr ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, tensorflow::gtl::ArraySlice arguments); @@ -155,6 +156,10 @@ class Executable { static Status DumpToDirectory(const string& directory_path, string filename, const SessionModule& session_module); + // Dump hlo snapshot to directory_path/filename. + static Status DumpToDirectory(const string& directory_path, string filename, + const HloSnapshot& hlo_session); + protected: mutable tensorflow::mutex mutex_; diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc index 221ff7900f398166c193c495848a2afcfd4edc81..2d3e4b1fcdf6675955714cab262a8b2ca8ff4297 100644 --- a/tensorflow/compiler/xla/service/gather_expander.cc +++ b/tensorflow/compiler/xla/service/gather_expander.cc @@ -28,9 +28,15 @@ using tensorflow::gtl::ArraySlice; static StatusOr TransposeIndexVectorDimToLast( HloInstruction* gather_indices, int64 index_vector_dim) { const Shape& gather_indices_shape = gather_indices->shape(); + + if (gather_indices_shape.dimensions_size() == index_vector_dim) { + return gather_indices; + } + if (index_vector_dim == (gather_indices_shape.dimensions_size() - 1)) { return gather_indices; } + std::vector permutation; permutation.reserve(gather_indices_shape.dimensions_size()); for (int64 i = 0, e = gather_indices_shape.dimensions_size(); i < e; i++) { @@ -42,55 +48,35 @@ static StatusOr TransposeIndexVectorDimToLast( return MakeTransposeHlo(gather_indices, permutation); } -// If the gather_indices holds scalar indices (i.e. gather_indices has rank N -// and index_vector_dim is N) then reshape it to have a trailing degenerate -// dimension. This makes the code for slicing out the index vector more -// uniform. -static StatusOr DeScalarizeGatherIndices( - HloInstruction* gather_indices, int64 index_vector_dim) { - const Shape& gather_indices_shape = gather_indices->shape(); - if (index_vector_dim != gather_indices_shape.dimensions_size()) { - return gather_indices; - } - - DCHECK_EQ(index_vector_dim, gather_indices_shape.dimensions_size()); - - std::vector result_shape_dims; - c_copy(gather_indices_shape.dimensions(), - std::back_inserter(result_shape_dims)); - result_shape_dims.push_back(1); - - return MakeReshapeHlo(result_shape_dims, gather_indices); -} - // Canonicalizes the gather_indices tensors so that we only have deal with some // specific cases in the while loop that does the heavy lifting. // // See the "High Level Algorithm" section for a broader picture. static StatusOr CanonicalizeGatherIndices( HloInstruction* gather_indices, int64 index_vector_dim) { - // If gather_indices holds scalar indices, normalize it to hold index vectors - // of size 1. + // Transpose the non-index-vector dimensions to the front. TF_ASSIGN_OR_RETURN( - HloInstruction * descalarized_gather_indices, - DeScalarizeGatherIndices(gather_indices, index_vector_dim)); + HloInstruction * transposed_gather_indices, + TransposeIndexVectorDimToLast(gather_indices, index_vector_dim)); + bool indices_are_scalar = + index_vector_dim == gather_indices->shape().dimensions_size(); - // Transpose the non-index-vector dimensions to the front. - TF_ASSIGN_OR_RETURN(HloInstruction * transposed_gather_indices, - TransposeIndexVectorDimToLast(descalarized_gather_indices, - index_vector_dim)); + // The number of dimensions in gather_indices that are index dimensions. + const int64 index_dims_in_gather_indices = indices_are_scalar ? 0 : 1; // If there is only one index (i.e. gather_indices has rank 1 and this gather // is really just a dynamic slice) add a leading degenerate dimension for // uniformity. Otherwise create a "collapsed" leading dimension that subsumes // all of the non-index-vector dimensions. const Shape& shape = transposed_gather_indices->shape(); - if (shape.dimensions_size() == 1) { - return ExpandFirstDimIntoNDims(transposed_gather_indices, - {1, shape.dimensions(0)}); + if (shape.dimensions_size() == index_dims_in_gather_indices) { + return PrependDegenerateDims(transposed_gather_indices, 1); } else { - return CollapseFirstNDims(transposed_gather_indices, - shape.dimensions_size() - 1); + // Collapse all but the dimensions (0 or 1) in gather_indices containing the + // index vectors. + return CollapseFirstNDims( + transposed_gather_indices, + shape.dimensions_size() - index_dims_in_gather_indices); } } @@ -112,11 +98,7 @@ static StatusOr AdjustGatherDimsInAccumulator( // dynamic-slice. In that case, there is a leading degenerate gather // dimension that we added to make this special case play well with the // general while loop which we need to remove now. - CHECK_EQ(accumulator->shape().dimensions(0), 1); - ArraySlice reshaped_dim_sizes = - AsInt64Slice(accumulator->shape().dimensions()); - reshaped_dim_sizes.remove_prefix(1); - return MakeReshapeHlo(reshaped_dim_sizes, accumulator); + return ElideDegenerateDims(accumulator, {0}); } return ExpandFirstDimIntoNDims(accumulator, output_gather_dim_bounds); @@ -161,50 +143,73 @@ static StatusOr ExpandIndexVectorIntoOperandSpace( static StatusOr> GatherLoopBody( const HloInstruction& gather, HloInstruction* induction_var, const std::vector& incoming_loop_state) { + const GatherDimensionNumbers& dim_numbers = gather.gather_dimension_numbers(); CHECK_EQ(incoming_loop_state.size(), 3); HloInstruction* const operand = incoming_loop_state[0]; HloInstruction* const gather_indices = incoming_loop_state[1]; HloInstruction* const output_accumulator = incoming_loop_state[2]; - int64 index_vector_size = gather_indices->shape().dimensions(1); + bool has_scalar_indices = gather_indices->shape().dimensions_size() == 1; + CHECK_EQ(has_scalar_indices, + dim_numbers.index_vector_dim() == + gather.operand(1)->shape().dimensions_size()); TF_ASSIGN_OR_RETURN( HloInstruction * induction_var_as_vector, MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{}, /*result_shape_bounds=*/{1})); - TF_ASSIGN_OR_RETURN( - HloInstruction * index_into_gather_indices, - PadVectorWithZeros(induction_var_as_vector, - /*zeros_to_prepend=*/0, /*zeros_to_append=*/1)); - - TF_ASSIGN_OR_RETURN( - HloInstruction * index_vector_2d, - MakeDynamicSliceHlo(gather_indices, index_into_gather_indices, - {1, index_vector_size})); + HloInstruction* index_vector; - TF_ASSIGN_OR_RETURN(HloInstruction * index_vector, - ElideDegenerateDims(index_vector_2d, {0})); + if (has_scalar_indices) { + // In this case gather_indices has rank 1 and induction_var_as_vector (of + // shape {1}) is an index into this rank 1 tensor. + TF_ASSIGN_OR_RETURN( + index_vector, + MakeDynamicSliceHlo(gather_indices, induction_var_as_vector, {1})); + } else { + // In this case gather_indices has rank 2 and induction_var_as_vector (of + // shape {1}) is an index into just the first dimension of this rank 2 + // tensor. + TF_ASSIGN_OR_RETURN( + HloInstruction * index_into_gather_indices, + PadVectorWithZeros(induction_var_as_vector, + /*zeros_to_prepend=*/0, /*zeros_to_append=*/1)); + + int64 index_vector_size = gather_indices->shape().dimensions(1); + TF_ASSIGN_OR_RETURN( + HloInstruction * index_vector_2d, + MakeDynamicSliceHlo(gather_indices, index_into_gather_indices, + {1, index_vector_size})); + + TF_ASSIGN_OR_RETURN(index_vector, + ElideDegenerateDims(index_vector_2d, {0})); + } - TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice_start, - ExpandIndexVectorIntoOperandSpace( - index_vector, gather.gather_dimension_numbers(), - operand->shape().dimensions_size())); + TF_ASSIGN_OR_RETURN( + HloInstruction * gathered_slice_start, + ExpandIndexVectorIntoOperandSpace(index_vector, dim_numbers, + operand->shape().dimensions_size())); TF_ASSIGN_OR_RETURN(HloInstruction * gathered_slice, MakeDynamicSliceHlo(operand, gathered_slice_start, gather.gather_window_bounds())); + TF_ASSIGN_OR_RETURN( + HloInstruction * gathered_slice_with_dims_elided, + ElideDegenerateDims(gathered_slice, + AsInt64Slice(dim_numbers.elided_window_dims()))); + TF_ASSIGN_OR_RETURN( HloInstruction * gathered_slice_for_update, - ExpandFirstDimIntoNDims(gathered_slice, - {1, gathered_slice->shape().dimensions(0)})); + PrependDegenerateDims(gathered_slice_with_dims_elided, 1)); TF_ASSIGN_OR_RETURN( HloInstruction * index_vector_into_accumulator, PadVectorWithZeros( induction_var_as_vector, /*zeros_to_prepend=*/0, - /*zeros_to_append=*/gathered_slice->shape().dimensions_size())); + /*zeros_to_append=*/ + gathered_slice_with_dims_elided->shape().dimensions_size())); TF_ASSIGN_OR_RETURN( HloInstruction * updated_accumulator, @@ -220,26 +225,20 @@ static StatusOr> GatherLoopBody( static StatusOr CreateGatherLoopAccumulatorInitValue( HloComputation* computation, PrimitiveType element_type, - ArraySlice window_bounds, int64 gather_loop_trip_count) { + ArraySlice window_bounds, int64 gather_loop_trip_count, + const GatherDimensionNumbers& dim_numbers) { std::vector accumulator_state_shape_dims; accumulator_state_shape_dims.reserve(1 + window_bounds.size()); accumulator_state_shape_dims.push_back(gather_loop_trip_count); - c_copy(window_bounds, std::back_inserter(accumulator_state_shape_dims)); + for (int64 i = 0; i < window_bounds.size(); i++) { + if (!c_binary_search(dim_numbers.elided_window_dims(), i)) { + accumulator_state_shape_dims.push_back(window_bounds[i]); + } + } return BroadcastZeros(computation, element_type, accumulator_state_shape_dims); } -static StatusOr ElideWindowDimsFromAccumulator( - HloInstruction* accumulator, const GatherDimensionNumbers& dim_numbers) { - std::vector dims_to_elide; - dims_to_elide.reserve(dim_numbers.elided_window_dims_size()); - for (int64 elided_window_dim : dim_numbers.elided_window_dims()) { - dims_to_elide.push_back(elided_window_dim + 1); - } - - return ElideDegenerateDims(accumulator, dims_to_elide); -} - // `accumulator` is almost the tensor the gather operation would have produced, // except that it has the dimensions in the wrong order -- the gather dimensions // are the major dimensions and the window dimensions are the minor dimensions. @@ -338,7 +337,8 @@ StatusOr GatherExpander::ExpandGather( HloInstruction * accumulator_init, CreateGatherLoopAccumulatorInitValue( computation, output_shape.element_type(), - gather_instr->gather_window_bounds(), gather_loop_trip_count)); + gather_instr->gather_window_bounds(), gather_loop_trip_count, + gather_instr->gather_dimension_numbers())); StatusOr> gather_loop_result_or_error = WhileUtil::MakeCountedLoop( @@ -353,14 +353,10 @@ StatusOr GatherExpander::ExpandGather( gather_loop_result_or_error); HloInstruction* accumulator_result = gather_loop_result.back(); - TF_ASSIGN_OR_RETURN( - HloInstruction * accumulator_with_window_dims_elided, - ElideWindowDimsFromAccumulator(accumulator_result, dim_numbers)); TF_ASSIGN_OR_RETURN( HloInstruction * accumulator_with_output_gather_dims_decanonicalized, - AdjustGatherDimsInAccumulator(gather_indices->shape(), - accumulator_with_window_dims_elided, + AdjustGatherDimsInAccumulator(gather_indices->shape(), accumulator_result, dim_numbers.index_vector_dim())); return PermuteGatherAndWindowDims( diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc index ba41ee8428cbe7132103df24d552565a8dc2f9f6..1c72ca066502eb549bf8638cdf0b7827b06f92d7 100644 --- a/tensorflow/compiler/xla/service/gather_expander_test.cc +++ b/tensorflow/compiler/xla/service/gather_expander_test.cc @@ -47,5 +47,62 @@ ENTRY main { "indices are not supported.")); } +TEST(GatherExpanderTest, AvoidDegenerateDims) { + const string hlo_text = R"( +HloModule TensorFlowGatherV2 + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + ROOT gather = s32[3,2] gather(operand, indices), + output_window_dims={0}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=1, + window_bounds={3, 1} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + tools::Parse(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); + ASSERT_TRUE(changed); + + HloInstruction* while_instr = nullptr; + for (auto* instr : module->entry_computation()->instructions()) { + if (instr->opcode() == HloOpcode::kWhile) { + ASSERT_EQ(while_instr, nullptr) + << "Expected exactly one while instruction in the entry computation " + "after gather expansion"; + while_instr = instr; + } + } + + ASSERT_NE(while_instr, nullptr) + << "Expected exactly one while instruction in the entry computation " + "after gather expansion"; + + // We want to avoid create while loop with shapes that have degenerate + // dimensions for TF gather. In this case we expect the loop state to be of + // the shape (sNN[], s32[3,3]{1,0}, s32[2]{0}, s32[2,3]{1,0}). The leading + // sNN is an implementation detail from WhileUtil::MakeCountedLoop so we don't + // check it here (though in theory the form of the while loop state is itself + // an implementation detail from WhileUtil::MakeCountedLoop). + + const Shape& while_shape = while_instr->shape(); + ASSERT_TRUE(ShapeUtil::IsTuple(while_shape)); + ASSERT_EQ(ShapeUtil::TupleElementCount(while_shape), 4); + + EXPECT_TRUE(ShapeUtil::SameDimensions( + ShapeUtil::MakeShape(S32, {3, 3}), + ShapeUtil::GetTupleElementShape(while_shape, 1))); + + EXPECT_TRUE(ShapeUtil::SameDimensions( + ShapeUtil::MakeShape(S32, {2}), + ShapeUtil::GetTupleElementShape(while_shape, 2))); + + EXPECT_TRUE(ShapeUtil::SameDimensions( + ShapeUtil::MakeShape(S32, {2, 3}), + ShapeUtil::GetTupleElementShape(while_shape, 3))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index a99e2b7794a399047fb5a77a140bd333214e3f23..ddb687314ee8221ba9282f230db498b3a5d23499 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -32,8 +32,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id, @@ -45,9 +43,9 @@ se::Platform::Id GenericTransferManager::PlatformId() const { } Status GenericTransferManager::WriteSingleTupleIndexTable( - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, tensorflow::gtl::ArraySlice elements, - const Shape& shape, perftools::gputools::DeviceMemoryBase* region) { + const Shape& shape, se::DeviceMemoryBase* region) { TF_RET_CHECK(elements.size() == ShapeUtil::TupleElementCount(shape)); std::vector element_pointers; @@ -144,20 +142,19 @@ Status GenericTransferManager::TransferLiteralToInfeed( } Status GenericTransferManager::TransferBufferToInfeed( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source) { + se::StreamExecutor* executor, int64 size, const void* source) { return Unimplemented("Generic transfer to Infeed"); } Status GenericTransferManager::TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, + se::StreamExecutor* executor, const Shape& literal_shape, Literal* literal) { return Unimplemented( "Outfeed is not supported on this platform (b/30467474)"); } Status GenericTransferManager::ResetDevices( - tensorflow::gtl::ArraySlice + tensorflow::gtl::ArraySlice /*executors*/) { return Unimplemented( "Device reset is not yet supported on this platform (b/30481585)"); diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index 63a7c820cf4e5fbbdf870086a4fb5316ac50d10b..0579099de40ba3e43f69e4e6474b56691064c692 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -36,46 +36,41 @@ namespace xla { // infeed. class GenericTransferManager : public TransferManager { public: - GenericTransferManager(perftools::gputools::Platform::Id platform_id, - size_t pointer_size); + GenericTransferManager(se::Platform::Id platform_id, size_t pointer_size); ~GenericTransferManager() override {} - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; StatusOr> TransferLiteralFromDevice( - perftools::gputools::StreamExecutor* executor, - const ShapedBuffer& device_buffer) override; + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override; - Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToDevice(se::StreamExecutor* executor, const Literal& literal, const ShapedBuffer& device_buffer) override; - Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToInfeed(se::StreamExecutor* executor, const Literal& literal) override; - Status TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) override; + Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) override; Status ResetDevices( - tensorflow::gtl::ArraySlice - executors) override; + tensorflow::gtl::ArraySlice executors) override; int64 GetByteSizeRequirement(const Shape& shape) const override; protected: - Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor, - int64 size, const void* source) override; + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source) override; Status WriteSingleTupleIndexTable( - perftools::gputools::StreamExecutor* executor, - tensorflow::gtl::ArraySlice - elements, - const Shape& shape, - perftools::gputools::DeviceMemoryBase* region) override; + se::StreamExecutor* executor, + tensorflow::gtl::ArraySlice elements, + const Shape& shape, se::DeviceMemoryBase* region) override; private: // The platform this transfer manager targets. - const perftools::gputools::Platform::Id platform_id_; + const se::Platform::Id platform_id_; // The size in bytes of pointers on this platform. const size_t pointer_size_; diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index 2029c303d47e9a62135b003c3bd9be6f8b3438d4..837f05244f7a8c71483cc30eeac9e1c219e6bbd2 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -28,8 +28,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h index ea7f0eb3745f2e0e0bfd88c3dca79d6ad25884ed..c2fc35be4ca4bc6df85ee21fb6b564bfd6de3ec0 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h @@ -41,7 +41,7 @@ class BufferAllocations { // user-specified result buffers) to the given buffer index. The builder // will skip allocating buffers for registered buffer indices. void RegisterBuffer(BufferAllocation::Index index, - perftools::gputools::DeviceMemoryBase address); + se::DeviceMemoryBase address); // Builds a BufferAllocations object from the given buffer assignment. // `memory_allocator` is what this function uses to allocate device memory. @@ -52,8 +52,7 @@ class BufferAllocations { DeviceMemoryAllocator* memory_allocator); private: - std::map - registered_buffers_; + std::map registered_buffers_; }; BufferAllocations(const BufferAllocations&) = delete; @@ -65,22 +64,20 @@ class BufferAllocations { // Returns the device address of buffer `buffer_index`. `buffer_index` must be // a valid index, i.e., in [0, buffer_count). This function returns null if // `buffer_index` is not assigned to a buffer address. - perftools::gputools::DeviceMemoryBase GetDeviceAddress( + se::DeviceMemoryBase GetDeviceAddress( BufferAllocation::Index buffer_index) const; // Same as above, but also adjusts the returned address for the offset and // size contained in the given slice. - perftools::gputools::DeviceMemoryBase GetDeviceAddress( + se::DeviceMemoryBase GetDeviceAddress( const BufferAllocation::Slice& buffer_slice) const; - perftools::gputools::DeviceMemoryBase GetTempBufferBase() const { - return temp_buffer_base_; - } + se::DeviceMemoryBase GetTempBufferBase() const { return temp_buffer_base_; } // Tears down all buffers allocated by this object that are not in // `live_addresses`. tensorflow::Status TearDown( - const std::set& live_addresses, + const std::set& live_addresses, const BufferAssignment& buffer_assignment); private: @@ -92,15 +89,15 @@ class BufferAllocations { // Sets the device address of buffer `buffer_index`. void SetBuffer(BufferAllocation::Index buffer_index, - perftools::gputools::DeviceMemoryBase buffer); + se::DeviceMemoryBase buffer); // An array of device pointers that stores the address of each buffer // indexed by Index. Each element can point to a temporary buffer, an // input buffer, or nullptr if no buffer is needed for that Index. - std::vector buffers_; + std::vector buffers_; // The base address of the memory block that contains all temporary buffers. - perftools::gputools::DeviceMemoryBase temp_buffer_base_; + se::DeviceMemoryBase temp_buffer_base_; int device_ordinal_; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 790ca535b11ee47724ef6227de40726d940d6153..dce8de2e301ecfaa4674b8be48b8c02bfabf3f4b 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -42,11 +42,10 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable) { } Status ConditionalThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream) { // Copy the predicate value from device. bool predicate; - perftools::gputools::DeviceMemoryBase predicate_address = + se::DeviceMemoryBase predicate_address = buffer_allocations.GetDeviceAddress(predicate_buffer_index_); stream->ThenMemcpy(&predicate, predicate_address, sizeof(bool)); diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index 7725c46a3b4b51af34a4dd977885353ff32c21f6..e40872688fdad24d24db5f7cebb3206c77652dce 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -49,7 +49,7 @@ class ConditionalThunk : public Thunk { Status Initialize(const GpuExecutable& executable) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: BufferAllocation::Slice predicate_buffer_index_; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 461747b699b542ae0c8735aea34cc9e57c1fb387..64d3b84b8c73d82800270aebcebf7f7a8fec5fe4 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -25,8 +25,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 900d9cb6243088b56a1825fb3ab8c06cf8d74726..6d845025b1aef2b0a5f147401b6db0598ba94d6d 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -66,23 +66,21 @@ class ConvolutionThunk : public Thunk { // Does the convolution for the thunk on "stream". Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: class ScratchAllocator; - Status Convolve( - const perftools::gputools::dnn::BatchDescriptor& input_descriptor, - perftools::gputools::DeviceMemory input_data, - const perftools::gputools::dnn::FilterDescriptor& filter_descriptor, - perftools::gputools::DeviceMemory filter_data, - const perftools::gputools::dnn::BatchDescriptor& output_descriptor, - perftools::gputools::DeviceMemory output_data, - const perftools::gputools::dnn::ConvolutionDescriptor& - convolution_descriptor, - const perftools::gputools::dnn::AlgorithmConfig& algorithm_config, - perftools::gputools::Stream* stream, ScratchAllocator* scratch_allocator, - perftools::gputools::dnn::ProfileResult* profile_result); + Status Convolve(const se::dnn::BatchDescriptor& input_descriptor, + se::DeviceMemory input_data, + const se::dnn::FilterDescriptor& filter_descriptor, + se::DeviceMemory filter_data, + const se::dnn::BatchDescriptor& output_descriptor, + se::DeviceMemory output_data, + const se::dnn::ConvolutionDescriptor& convolution_descriptor, + const se::dnn::AlgorithmConfig& algorithm_config, + se::Stream* stream, ScratchAllocator* scratch_allocator, + se::dnn::ProfileResult* profile_result); const CudnnConvKind convolution_kind_; diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc index f4498663b1c039b3175376baf8f27c4ecec678ec..bf912fbd14de5874062a79db28186ab233575233 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc @@ -30,9 +30,8 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk( mem_size_(mem_size) {} tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { - perftools::gputools::DeviceMemoryBase destination_data = + const BufferAllocations& buffer_allocations, se::Stream* stream) { + se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); stream->ThenMemcpy(&destination_data, source_address_, mem_size_); return tensorflow::Status::OK(); @@ -48,11 +47,10 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk( mem_size_(mem_size) {} tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { - perftools::gputools::DeviceMemoryBase destination_data = + const BufferAllocations& buffer_allocations, se::Stream* stream) { + se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); - perftools::gputools::DeviceMemoryBase source_data = + se::DeviceMemoryBase source_data = buffer_allocations.GetDeviceAddress(source_buffer_); stream->ThenMemcpy(&destination_data, source_data, mem_size_); return tensorflow::Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h index e2783fd255239d31edc89701ea208f33ebb8d3fb..2e7eb5f3445bc9294fa455ef31fb816cdba4726c 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h @@ -40,8 +40,7 @@ class HostToDeviceCopyThunk : public Thunk { HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const void* source_address_; @@ -64,8 +63,7 @@ class DeviceToDeviceCopyThunk : public Thunk { DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const BufferAllocation::Slice source_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index 58d9c8caff31e878487fbef01afce566e6187fd9..68099fd63847ef9993f9bc7ac0e28b2939631b35 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -28,7 +28,6 @@ limitations under the License. namespace xla { namespace gpu { -namespace se = ::perftools::gputools; namespace dnn = se::dnn; static std::pair> - AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override; + se::port::StatusOr> AllocateBytes( + se::Stream* stream, int64 byte_size) override; private: const int device_ordinal_; DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; + std::vector allocated_buffers_; int64 total_allocated_bytes_ = 0; }; @@ -74,16 +74,15 @@ class FftThunk : public Thunk { // Does the FFT for the thunk on "stream". tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: - const perftools::gputools::fft::Type fft_type_; + const se::fft::Type fft_type_; const std::vector fft_length_; float scale_factor_; - std::unique_ptr fft_plan_; + std::unique_ptr fft_plan_; const BufferAllocation::Slice input_buffer_; const BufferAllocation::Slice output_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 283d21ca222a236a69e4bab1b6504665d4d1cdd3..6e6966df3987eef29b2122c3ef8f11b7cd0bfe14 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -36,8 +36,7 @@ tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) { } tensorflow::Status ForThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream) { for (int64 i = 0; i < loop_limit_; ++i) { // Invoke loop body thunk sequence. TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 832494d17e9c4e1d9e92e18ef331df1cf3689024..c78d1c50686297aea8235af928aba562697f49bc 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -38,8 +38,7 @@ class ForThunk : public Thunk { tensorflow::Status Initialize(const GpuExecutable& executable) override; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const int64 loop_limit_; diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 38668ff455a44c7ef99b57b750f1a3b18a90bd2c..0ec12f52d8b398218ec370fc74bfdf6f97f43893 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -22,8 +22,6 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index df3edcefef898d465cd5ddc53e5d06a966a31f88..a18f425bc38fd3fbbb345901514c4ac16dbe97ec 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -50,14 +50,12 @@ class GemmThunk : public Thunk { // Does the gemm operation for the thunk on "stream", which must be non-null. tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; // Returns true if we'll perform autotuning if run on the given stream. If // so, we want the GPU to be quiescent during autotuning, so as not to // introduce noise in our results. - bool ShouldHaltAllActivityBeforeRunning( - perftools::gputools::Stream* stream) override { + bool ShouldHaltAllActivityBeforeRunning(se::Stream* stream) override { return autotune_results_.count( stream->parent()->GetDeviceDescription().name()) != 0; } @@ -79,8 +77,7 @@ class GemmThunk : public Thunk { // results. The map's value is the best algorithm we've found for this thunk // on this device, or an error if none of the algorithms worked and we should // use the regular gemm without an algorithm. - std::unordered_map> + std::unordered_map> autotune_results_; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 07be2a0cf90c326af6e41764e79950db546e43e4..30bfc9351a5273b4cf854e269c5f576f6dd1bef7 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -91,8 +91,6 @@ limitations under the License. #include "tensorflow/core/platform/tracing.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { @@ -779,9 +777,9 @@ se::Platform::Id GpuCompiler::PlatformId() const { } // namespace xla static bool InitModule() { - xla::Compiler::RegisterCompilerFactory(se::cuda::kCudaPlatformId, []() { - return xla::MakeUnique(); - }); + xla::Compiler::RegisterCompilerFactory( + stream_executor::cuda::kCudaPlatformId, + []() { return xla::MakeUnique(); }); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index c352d4d8462fadb266c55ad437de998e86a6528e..f3b02ae5d8867bdf1d970e809bff95a15d9f54d2 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -45,25 +45,23 @@ class GpuCompiler : public LLVMCompiler { // Bring in // StatusOr>> Compile( // std::vector> modules, - // std::vector> + // std::vector> // stream_execs) using LLVMCompiler::Compile; StatusOr> RunHloPasses( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( - std::unique_ptr module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::vector> module, AotCompilationOptions const& options) override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override { // Capture just the pointer size, not the entire GpuCompiler object. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 28f93447953b90d8a7fa4386e2355066c0405aec..980cc89fa03abd874a8e0a694f2abb775c1de050 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -34,8 +34,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { namespace { @@ -252,7 +250,7 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } -StatusOr> GpuExecutable::ExecuteOnStream( +StatusOr GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -299,13 +297,13 @@ StatusOr> GpuExecutable::ExecuteOnStream( HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); auto device_ordinal = executor->device_ordinal(); - auto shaped_buffer = MakeUnique( - root->shape(), root->shape(), executor->platform(), device_ordinal); + ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(), + memory_allocator, device_ordinal); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer. std::set buffers_in_result; - TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( + TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus( [&buffer_allocations, &buffers_in_result, &shaped_buffer, this]( const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { const auto& sources = this->GetRootPointsToSet().element(index); @@ -324,7 +322,7 @@ StatusOr> GpuExecutable::ExecuteOnStream( this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index())); CHECK(!slice.allocation()->is_entry_computation_parameter()); - perftools::gputools::DeviceMemoryBase src_base = + se::DeviceMemoryBase src_base = buffer_allocations->GetDeviceAddress(slice.index()); CHECK(!src_base.is_null() || src_base.size() == 0); *device_memory = src_base; @@ -337,7 +335,7 @@ StatusOr> GpuExecutable::ExecuteOnStream( return std::move(shaped_buffer); } -StatusOr> GpuExecutable::ExecuteAsyncOnStream( +StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index dcb3991f41a31db84d8e9e555ae7d13c3ac84b97..80ec38c3ac114fe4ad9d56784330c1144d913db1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -74,12 +74,12 @@ class GpuExecutable : public Executable { // ExecuteOnStream will fail if the compute capability of the stream doesn't // match the compute capability passed to this object's constructor. - StatusOr> ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr> ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index af9897769fda371e47af06c19abce9a06015e094..f13727ca9b6954f6be9b9277018fcc64ee326954 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -33,8 +33,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { // TODO(b/30467474) Once GPU infeed implementation settles, consider @@ -153,8 +151,8 @@ static std::unique_ptr CreateGpuTransferManager() { } static bool InitModule() { - xla::TransferManager::RegisterTransferManager(se::cuda::kCudaPlatformId, - &CreateGpuTransferManager); + xla::TransferManager::RegisterTransferManager( + stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager); return true; } static bool module_initialized = InitModule(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h index 9aa369c668364079504ead3491903e2590a142cc..d040a99975230578c270deabdfe60c61649e778c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h @@ -36,21 +36,20 @@ class GpuTransferManager : public GenericTransferManager { GpuTransferManager(); ~GpuTransferManager() override {} - Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor, + Status TransferLiteralToInfeed(se::StreamExecutor* executor, const Literal& literal) override; - Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor, - int64 size, const void* source) override; + Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, + const void* source) override; private: // Initiates the infeed data transfers. InfeedBuffer->Done() must be // called to clean up the memory allocated for InfeedBuffer. StatusOr TransferBufferToInfeedInternal( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source); + se::StreamExecutor* executor, int64 size, const void* source); // Enqueues infeed data buffers with the infeed manager after their // transfer completes. - Status EnqueueBuffersToInfeed(perftools::gputools::StreamExecutor* executor, + Status EnqueueBuffersToInfeed(se::StreamExecutor* executor, std::vector buffers); TF_DISALLOW_COPY_AND_ASSIGN(GpuTransferManager); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc index ee5b447c9cd0b1fde4d3a0943d5d4cb8cc5b3376..3ddc1c0789d746bf021256638342364aac63e0e3 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc @@ -19,8 +19,6 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/core/platform/logging.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h index 73d5a5ce35497f156a181371bfb97fc37a8eb09e..d5f2216d460a45085536b15f9bf6e3bd3579f9c8 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h @@ -46,7 +46,7 @@ namespace gpu { // the client. The client manages the memory of the buffer. class InfeedBuffer { public: - InfeedBuffer(perftools::gputools::StreamExecutor* executor, int64 length) + InfeedBuffer(se::StreamExecutor* executor, int64 length) : executor_(executor), length_(length) { device_memory_ = executor_->AllocateArray(length); CHECK(!device_memory_.is_null()); @@ -60,14 +60,12 @@ class InfeedBuffer { // client to manage memory for the infeed buffers. void Done() { delete this; } - perftools::gputools::DeviceMemoryBase* device_memory() { - return &device_memory_; - } + se::DeviceMemoryBase* device_memory() { return &device_memory_; } private: - perftools::gputools::StreamExecutor* executor_; // Not owned. + se::StreamExecutor* executor_; // Not owned. const int64 length_; - perftools::gputools::DeviceMemoryBase device_memory_; + se::DeviceMemoryBase device_memory_; }; // Client-side class used to enqueue infeed buffers. @@ -100,8 +98,7 @@ class InfeedManager { // new stream on the first invocation. On subsequent invocations, if // the cached executor is not the same as the requested executor, // returns null. - perftools::gputools::Stream* GetStream( - perftools::gputools::StreamExecutor* executor); + se::Stream* GetStream(se::StreamExecutor* executor); private: // TODO(b/30467474): Revisit if this mutex becomes a point of @@ -121,10 +118,10 @@ class InfeedManager { tensorflow::gtl::FlatSet dequeued_buffer_; // Cached host to device stream for queuing infeed data. - std::unique_ptr host_to_device_stream_; + std::unique_ptr host_to_device_stream_; // Executor that the host_to_device_stream belongs to. Not owned. - perftools::gputools::StreamExecutor* host_to_device_executor_; + se::StreamExecutor* host_to_device_executor_; }; // Singleton creator-or-accessor: Returns the GPU infeed manager. diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index 2ac95ceb692447c7ac6dbbcd8b9a38876f7a77b6..ea34d5b30c91e8b809e3e17a904e27e589fd6b5f 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -31,10 +31,10 @@ InfeedThunk::InfeedThunk( destination_buffer_(destination_buffer) {} Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + se::Stream* stream) { VLOG(2) << "Infeeding to GPU "; - perftools::gputools::DeviceMemoryBase destination_address = + se::DeviceMemoryBase destination_address = buffer_allocations.GetDeviceAddress(destination_buffer_); InfeedManager* infeed_manager = GetOrCreateInfeedManager(); @@ -45,7 +45,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, std::vector tuple_element_addresses; for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) { - perftools::gputools::DeviceMemoryBase tuple_element_address = + se::DeviceMemoryBase tuple_element_address = buffer_allocations.GetDeviceAddress(tuple_element_buffer); InfeedBuffer* buffer = infeed_manager->BlockingDequeueBuffer(); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h index 86918705fa0305217f11753e383200c7bd71474b..93713cb12defd95bdd69cb0aa7ad7b4e37fc8fae 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -44,7 +44,7 @@ class InfeedThunk : public Thunk { InfeedThunk& operator=(const InfeedThunk&) = delete; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: const std::vector tuple_element_buffers_; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index 3790ed313b9d0e167185a8b12c812132ee78811f..a78b4ff83075fd7ef330bb97ce217a198d450cf8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -32,7 +32,7 @@ class IrEmitterContext { public: IrEmitterContext(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, - const perftools::gputools::DeviceDescription* device_desc, + const se::DeviceDescription* device_desc, llvm::Module* llvm_module) : hlo_module_(hlo_module), buffer_assignment_(buffer_assignment), @@ -47,7 +47,7 @@ class IrEmitterContext { const BufferAssignment& buffer_assignment() const { return *buffer_assignment_; } - const perftools::gputools::DeviceDescription& device_description() const { + const se::DeviceDescription& device_description() const { return *device_desc_; } llvm::Module* llvm_module() { return llvm_module_; } @@ -56,7 +56,7 @@ class IrEmitterContext { private: const HloModule* hlo_module_; const BufferAssignment* buffer_assignment_; - const perftools::gputools::DeviceDescription* device_desc_; + const se::DeviceDescription* device_desc_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; }; diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index c24dc1457f83c7557430a69baf806ed05b45adca..d376ef7a245eb9ed86939f44c611b6dde5606b23 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -23,8 +23,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index df8971b083fe70588f8c32f977981e365d78fdb8..b556befe66b6bebba1a958f553f0a9b2c4eebbe4 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -61,8 +61,7 @@ class KernelThunk : public Thunk { // Executes the kernel for the thunk on "stream", which must be non-null. tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: // Buffers passed to the kernel as arguments. @@ -82,13 +81,11 @@ class KernelThunk : public Thunk { // Describes how to load this kernel. ExecuteOnStream reuses this loader // specification for all executions. mutable tensorflow::mutex mutex_; - std::unique_ptr loader_spec_ - GUARDED_BY(mutex_); + std::unique_ptr loader_spec_ GUARDED_BY(mutex_); // Loaded kernels for each `StreamExecutor` - std::unordered_map - kernel_cache_ GUARDED_BY(mutex_); + std::unordered_map kernel_cache_ + GUARDED_BY(mutex_); }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc index 18e673542c5b47cb90d31a8eff62a5e4adb78d1d..d4100a898b5bb9eec382c34932c2db104c9e985b 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.cc @@ -19,8 +19,6 @@ limitations under the License. namespace xla { namespace gpu { -namespace se = ::perftools::gputools; - Status MemzeroThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { se::DeviceMemoryBase dest_data = buffer_allocations.GetDeviceAddress(dest_); diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h index b4bb74d1dd6dc9d09c5e4d439d57dfe8b57c2ed9..51c332d287d139335b356fc66411b5ffaa448b5a 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h @@ -36,7 +36,7 @@ class MemzeroThunk : public Thunk { : Thunk(Kind::kMemzero, hlo), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: const BufferAllocation::Slice dest_; @@ -52,7 +52,7 @@ class Memset32BitValueThunk : public Thunk { : Thunk(Kind::kMemset32BitValue, hlo), value_(value), dest_(dest) {} Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: uint32 value_; diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc index 5283d51cd10668c43c5ad1c1fb11049555bff5d4..d3fd0544fb68809125e9b9f7a5e5b7eff8c6ef43 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc @@ -29,8 +29,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h index 42d2d2af2e334da7c42419cb07a2bd5bb9d209d6..c125474edb1036090a926020f2b1e7fcf64c751a 100644 --- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h +++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h @@ -57,8 +57,7 @@ std::ostream& operator<<(std::ostream& out, const LaunchDimensions& launch_dims); LaunchDimensions CalculateLaunchDimensions( - const Shape& shape, - const perftools::gputools::DeviceDescription& device_desc, + const Shape& shape, const se::DeviceDescription& device_desc, int unroll_factor = 1); } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index d8a43091d4037a0edd125a4a1b6cb5ad7c7065f0..c8510808f10a731af90154447bd3e1e037db6348 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -33,8 +33,7 @@ tensorflow::Status SequentialThunk::Initialize( } tensorflow::Status SequentialThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { + const BufferAllocations& buffer_allocations, se::Stream* stream) { for (const auto& thunk : thunks_) { TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream)); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 32c5b748aba14239d6795d14e442c1c3b43d010e..df17b8d67b80321c7088243eae46e7a723b4ede9 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -40,8 +40,7 @@ class SequentialThunk : public Thunk { tensorflow::Status Initialize(const GpuExecutable& executable) override; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: // The list of sub-thunks. diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 9eea958d1214b131d49cb4e28f1944860408d3a8..a0c785ed913109e987d058124c8ef49019c98500 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -85,8 +85,7 @@ class Thunk { // This value is not required to be constant for a given Thunk. For example, // a Thunk that performs autotuning may return true for its first run and // false thereafter. - virtual bool ShouldHaltAllActivityBeforeRunning( - perftools::gputools::Stream* /*stream*/) { + virtual bool ShouldHaltAllActivityBeforeRunning(se::Stream* /*stream*/) { return false; } @@ -104,8 +103,7 @@ class Thunk { // called after Initialize and can be called multiple times over Thunk's // lifetime. Stream argument must be non-null. virtual tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) = 0; + const BufferAllocations& buffer_allocations, se::Stream* stream) = 0; private: Kind kind_; diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index bd65e72393a59e72671ff0cc32c37eaa48856255..ecb54857ccc40ead21e5a18d79a37b545680021d 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -17,8 +17,6 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" -namespace se = ::perftools::gputools; - namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h index 3b1a496328540ae69a449e7080903d31284885d1..8b459c29a136a6e7853e68a1bead7d12c0d08ad0 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h @@ -46,8 +46,7 @@ class TupleThunk : public Thunk { TupleThunk& operator=(const TupleThunk&) = delete; tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + const BufferAllocations& buffer_allocations, se::Stream* stream) override; private: const std::vector tuple_element_buffers_; diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index c21559af6d2e5dfb5aaf62afcdcaed514e0914c9..a9f3d619a3ffd6d849572355e2902375e43508fa 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -41,8 +41,8 @@ Status WhileThunk::Initialize(const GpuExecutable& executable) { } Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) { - perftools::gputools::DeviceMemoryBase condition_result_data = + se::Stream* stream) { + se::DeviceMemoryBase condition_result_data = buffer_allocations.GetDeviceAddress(condition_result_buffer_index_); while (true) { diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 4c9f45de9e42494df58706d0a4a3eb0c4220b8b8..e589ca78a7ea00e7592d6e09ead9c270f902702f 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -47,7 +47,7 @@ class WhileThunk : public Thunk { Status Initialize(const GpuExecutable& executable) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, - perftools::gputools::Stream* stream) override; + se::Stream* stream) override; private: const BufferAllocation::Slice condition_result_buffer_index_; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 8fd7f8945c7c36a451af30fcd5939a2498648e74..aa6860880b7a1308d3ecabb52318daa7d2852af2 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -296,3 +296,20 @@ message HloProto { HloOrderingProto hlo_ordering = 2; BufferAssignmentProto buffer_assignment = 3; } + +// Encapsulates HloProto together with the arguments, result, and +// execution_platform. This message is used for purposes such as +// analysis/replay/file-storage. +message HloSnapshot { + // The hlo graph. + HloProto hlo = 1; + + // The arguments passed to the graph. + repeated LiteralProto arguments = 2; + + // The result of the graph. + LiteralProto result = 3; + + // The name of the platform used to run the graph. + string execution_platform = 4; +} diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc index 7aa38c6b79ed904bb4a518c4b7aaa1e079c27ea8..35ecd4428d0dfde2de445ea34472d2c78148c6c9 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc @@ -69,8 +69,7 @@ StatusOr HloConstantFolding::Run(HloModule* module) { // Broadcasts dramatically increase the size of constants, which is often // detrimental to performance and memory capacity, so do not fold // broadcasts. - if (instruction->opcode() == HloOpcode::kBroadcast || - instruction->opcode() == HloOpcode::kBroadcastDimOne) { + if (instruction->opcode() == HloOpcode::kBroadcast) { continue; } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index ea4dd62fdb5bb3be40987d1a6ea96b3a58b0053b..44e4f75f75b275653e1a07111943843fc6f78b33 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -336,11 +336,6 @@ Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) { return Status::OK(); } -Status HloCostAnalysis::HandleBroadcastDimOne( - const HloInstruction* broadcastDimOne) { - return Status::OK(); -} - Status HloCostAnalysis::HandlePad(const HloInstruction*) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index a9f6845747aa2081df936d388551bbc0b75b787b..d17678d20f2a23fd98d18b77d5fb25853901a789 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -95,7 +95,6 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleSelectAndScatter(const HloInstruction* instruction) override; Status HandleBitcast(const HloInstruction* bitcast) override; Status HandleBroadcast(const HloInstruction* broadcast) override; - Status HandleBroadcastDimOne(const HloInstruction* broadcastDimOne) override; Status HandlePad(const HloInstruction* pad) override; Status HandleReshape(const HloInstruction* reshape) override; Status HandleTranspose(const HloInstruction* transpose) override; diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index b186767ce792cd89ae77fe9a03b3a2ecf296b804..9a89888480b8c79dfb1f79a50e9686bf45aa49b3 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -163,6 +163,8 @@ StatusOr MakeConcatHlo(ArraySlice operands, } StatusOr CollapseFirstNDims(HloInstruction* operand, int64 n) { + CHECK_GT(n, 0); + const Shape& operand_shape = operand->shape(); CHECK_GE(operand_shape.dimensions_size(), n); int64 new_shape_leading_bound = 1; @@ -184,6 +186,17 @@ StatusOr CollapseFirstNDims(HloInstruction* operand, int64 n) { return MakeReshapeHlo(output_shape, operand); } +StatusOr PrependDegenerateDims(HloInstruction* operand, + int64 n) { + CHECK_GT(n, 0); + std::vector new_shape_dims; + const Shape& operand_shape = operand->shape(); + new_shape_dims.reserve(n + operand_shape.dimensions_size()); + new_shape_dims.insert(new_shape_dims.begin(), n, 1); + c_copy(operand_shape.dimensions(), std::back_inserter(new_shape_dims)); + return MakeReshapeHlo(new_shape_dims, operand); +} + StatusOr ExpandFirstDimIntoNDims( HloInstruction* operand, ArraySlice expanded_dims) { CHECK_GT(operand->shape().dimensions_size(), 0); diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index d99e32a737e6aaa2ff746cf6c00d4300cf62f4e1..c9a7361a6af0c2a0839c59a0ea695ec1b9a98bd4 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -103,12 +103,22 @@ StatusOr MakeConcatHlo( // their operand(s). // Collapses (via reshape) the first N (logical) dimensions of `operand` into a -// single leading dimension. `operand` must have rank > n. +// single leading dimension. `operand` must have rank > `n` and `n` must not be +// 0. // // For instance if `operand` has shape f32[7,8,9] and n is 2 then the output is // the `operand` reshaped to [56,9]. StatusOr CollapseFirstNDims(HloInstruction* operand, int64 n); +// Prepends `n` degenerate dimensions (dimensions with bound = 1) to `operand` +// using a reshape. +// +// For instance if operand has shape f32[3,4,5] then this returns the operand +// reshaped to f32[1,3,4,5]. If the operand is a f32 scalar (i.e. has shape +// f32[]) then this returns the operand reshaped to f32[1]. +StatusOr PrependDegenerateDims(HloInstruction* operand, + int64 n); + // Expands (via reshape) the first (logical) dimension of `operand` into a // sequence of `expanded_dims` dimensions. `operand` must at least be of rank 1 // and the number of elements in its first dimension must be equal to the diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b681a5bf6f34b724bed52d60df59c2ac1068b52 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc @@ -0,0 +1,234 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { +using tensorflow::gtl::ArraySlice; + +std::unique_ptr CreateModuleWithProgramShape( + PrimitiveType primitive_type, ArraySlice input_shape_dims, + ArraySlice output_shape_dims, HloInstruction** param, + HloComputation** entry_computation) { + Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims); + Shape output_shape = ShapeUtil::MakeShape(primitive_type, output_shape_dims); + std::unique_ptr module = MakeUnique("test"); + *entry_computation = module->AddEntryComputation( + CreateComputationWithSignature({&input_shape}, output_shape, "entry") + .ValueOrDie()); + *param = (*entry_computation)->parameter_instruction(0); + return module; +} + +TEST(HloCreationUtilsTest, CollapseFirst1Dim) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{2}, /*output_shape_dims=*/{2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed, + CollapseFirstNDims(param, 1)); + entry_computation->set_root_instruction(first_1_dims_collapsed); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR1({3, 4})})); + CHECK_EQ(*result_literal, *Literal::CreateR1({3, 4})); +} + +TEST(HloCreationUtilsTest, CollapseFirst2Dims) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed, + CollapseFirstNDims(param, 2)); + entry_computation->set_root_instruction(first_2_dims_collapsed); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, + {Literal::CreateR3( + {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})})); + CHECK_EQ(*result_literal, + *Literal::CreateR2( + {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}})); +} + +TEST(HloCreationUtilsTest, Prepend1DegenerateDim) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended, + PrependDegenerateDims(param, 1)); + entry_computation->set_root_instruction(with_1_degenerate_dim_prepended); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR1({9, 10})})); + CHECK_EQ(*result_literal, *Literal::CreateR2({{9, 10}})); +} + +TEST(HloCreationUtilsTest, Prepend2DegenerateDims) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended, + PrependDegenerateDims(param, 2)); + entry_computation->set_root_instruction(with_2_degenerate_dims_prepended); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR1({9, 10})})); + CHECK_EQ(*result_literal, *Literal::CreateR3({{{9, 10}}})); +} + +TEST(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{}, /*output_shape_dims=*/{1, 1}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended, + PrependDegenerateDims(param, 2)); + entry_computation->set_root_instruction(with_2_degenerate_dims_prepended); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR0(9)})); + CHECK_EQ(*result_literal, *Literal::CreateR2({{9}})); +} + +TEST(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded, + ExpandFirstDimIntoNDims(param, {3, 1, 2})); + entry_computation->set_root_instruction(first_dim_expanded); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR1({1, 2, 3, 4, 5, 6})})); + CHECK_EQ(*result_literal, + *Literal::CreateR3({{{1, 2}}, {{3, 4}}, {{5, 6}}})); +} + +TEST(HloCreationUtilsTest, PadVectorWithZeros) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{2}, /*output_shape_dims=*/{6}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN( + HloInstruction * zero_padded_param, + PadVectorWithZeros(param, /*zeros_to_prepend=*/3, /*zeros_to_append=*/1)); + entry_computation->set_root_instruction(zero_padded_param); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR1({3, 4})})); + CHECK_EQ(*result_literal, *Literal::CreateR1({0, 0, 0, 3, 4, 0})); +} + +TEST(HloCreationUtilsTest, BroadcastZeros_S32) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + S32, + /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN( + HloInstruction * zeros, + BroadcastZeros(module->entry_computation(), S32, {2, 2})); + entry_computation->set_root_instruction(zeros); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR0(0)})); + CHECK_EQ(*result_literal, *Literal::CreateR2({{0, 0}, {0, 0}})); +} + +TEST(HloCreationUtilsTest, BroadcastZeros_F32) { + HloInstruction* param; + HloComputation* entry_computation; + + std::unique_ptr module = CreateModuleWithProgramShape( + F32, + /*input_shape_dims=*/{}, /*output_shape_dims=*/{2, 2}, ¶m, + &entry_computation); + + TF_ASSERT_OK_AND_ASSIGN( + HloInstruction * zeros, + BroadcastZeros(module->entry_computation(), F32, {2, 2})); + entry_computation->set_root_instruction(zeros); + + HloEvaluator evaluator; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, + evaluator.Evaluate>( + *module, {Literal::CreateR0(0.0f)})); + CHECK_EQ(*result_literal, + *Literal::CreateR2({{0.0f, 0.0f}, {0.0f, 0.0f}})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index cd7cbbdd71706fddb64855f631eb09de35da52e8..3b22c93733af293e4d73a2b1b3ac8822dec6d5f5 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -97,6 +97,10 @@ StatusOr HloCSE::Run(HloModule* module) { const std::function eq_computations = std::equal_to(); for (auto* computation : module->computations()) { + if (only_fusion_computations_ && !computation->IsFusionComputation()) { + continue; + } + changed |= CombineConstants(computation, is_layout_sensitive_); std::list post_order = diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h index 70096e07a2493763a9d4b0dc8e1c31510718c6c2..5e2b348bdda2b31556fb692e24d2bad2e4173ef5 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.h +++ b/tensorflow/compiler/xla/service/hlo_cse.h @@ -29,9 +29,11 @@ class HloCSE : public HloPassInterface { public: // If is_layout_sensitive is true, then the simplifier preserves layout during // transformation. Otherwise, layout is ignored. - explicit HloCSE(bool is_layout_sensitive) - : is_layout_sensitive_(is_layout_sensitive) {} - ~HloCSE() override {} + explicit HloCSE(bool is_layout_sensitive, + bool only_fusion_computations = false) + : is_layout_sensitive_(is_layout_sensitive), + only_fusion_computations_(only_fusion_computations) {} + ~HloCSE() override = default; tensorflow::StringPiece name() const override { return "cse"; } // Run CSE on the given module. Returns whether the module was changed (common @@ -39,7 +41,8 @@ class HloCSE : public HloPassInterface { StatusOr Run(HloModule* module) override; private: - bool is_layout_sensitive_; + const bool is_layout_sensitive_; + const bool only_fusion_computations_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc index c782d1b0add17c70e0f54826917df251d5a613e2..d236f83aeb9254b9c6e6d04629758ac2c8fd0da3 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc @@ -178,24 +178,37 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { if (hlo->shape().element_type() == eliminate_type_) { Shape shape = ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_); + new_hlo = computation->AddInstruction( hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule())); + TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo)); + new_hlo = ToElementType(new_hlo, eliminate_type_); } else if (ShapeUtil::IsTuple(hlo->shape())) { Shape old_shape = hlo->shape(); Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_, replace_with_type_); + new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands( new_shape, new_operands, hlo->GetModule())); + TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo)); + // Convert the elements of the result of `new_hlo` to produce a new // tuple with shape `old_shape`. new_hlo = ConvertTupleElements(new_hlo, old_shape); } else { new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands( hlo->shape(), new_operands, hlo->GetModule())); + TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo)); } - TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo)); + TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_hlo)); + TF_RETURN_IF_ERROR(hlo->DropAllControlDeps()); + + // NB! We want to replace and remove side effecting instructions like Rng + // as well so we can't rely HloComputation::ReplaceInstruction to reliably + // remove the replaced instruction. + TF_RETURN_IF_ERROR(computation->RemoveInstruction(hlo)); changed = true; } } diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc index cb94d9f19b825d1321263a4737b66a6bf198a772..5c5a059e0fd895f03bc26a975609b57333237faf 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc @@ -22,6 +22,12 @@ namespace { namespace op = xla::testing::opcode_matchers; +using ::testing::Contains; +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Not; +using ::testing::ResultOf; + class HloElementTypeConverterTest : public HloTestBase { public: std::unique_ptr CreateModuleFromHloString( @@ -117,5 +123,65 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) { op::Convert(op::GetTupleElement(batch_norm, 2)))); } +TEST_F(HloElementTypeConverterTest, RngIsRemoved) { + const string& hlo_string = R"( +HloModule RngIsRemoved + +ENTRY main { + constant.3 = bf16[] constant(0) + constant.4 = bf16[] constant(1) + ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform +} + )"; + auto module = CreateModuleFromHloString(hlo_string); + HloElementTypeConverter type_converter(BF16, F32); + TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get())); + EXPECT_TRUE(converted); + + std::function is_bf16_rng = + [](const HloInstruction* inst) { + return inst->shape().element_type() == BF16 && + inst->opcode() == HloOpcode::kRng; + }; + + EXPECT_THAT(module->entry_computation()->instructions(), + Not(Contains(ResultOf(is_bf16_rng, Eq(true))))); +} + +TEST_F(HloElementTypeConverterTest, RngCtrlDep) { + const string& hlo_string = R"( +HloModule RngIsRemoved + +ENTRY main { + constant.3 = bf16[] constant(0) + constant.4 = bf16[] constant(1) + rng0 = bf16[1,2000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform + ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform +} + )"; + auto module = CreateModuleFromHloString(hlo_string); + + HloElementTypeConverter type_converter(BF16, F32); + TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get())); + EXPECT_TRUE(converted); + + HloInstruction *rng0, *rng1; + for (auto* inst : module->entry_computation()->instructions()) { + if (inst->opcode() == HloOpcode::kRng) { + const Shape& shape = inst->shape(); + ASSERT_EQ(shape.dimensions_size(), 3); + ASSERT_TRUE(shape.dimensions(1) == 2000 || shape.dimensions(1) == 1000); + if (shape.dimensions(1) == 2000) { + rng0 = inst; + } else { + rng1 = inst; + } + } + } + + EXPECT_THAT(rng0->control_successors(), ElementsAre(rng1)); + EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 52bc2c0448df080fa8224a2e28b66f13d8c9246b..c5e30148345fec2029bf533fcfa9deb89662ec83 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -1853,6 +1853,34 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault { return Status::OK(); } + // Enable CLZ only for int32 and uint32. + template < + typename NativeT, + typename std::enable_if< + (std::is_floating_point::value || + std::is_integral::value || is_complex_t::value) && + !(std::is_same::value || + std::is_same::value)>::type* = nullptr> + Status HandleClz(HloInstruction* clz) { + return InvalidArgument("Unsupported type for Clz"); + } + + template ::value || + std::is_same::value>::type* = nullptr> + Status HandleClz(HloInstruction* clz) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz], + ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) { + return 31 - tensorflow::Log2Floor(elem_operand); + })); + return Status::OK(); + } + + Status HandleClz(HloInstruction* clz) override { + return HandleClz(clz); + } + template ::value>::type* = nullptr> Status HandleSin(HloInstruction* sin) { diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 6fb91b9bef9d1df82b8806ce79cc147823edeb3d..be989846ef5cd2645da88ac9bbfea9534dd47821 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -88,7 +88,7 @@ std::unique_ptr CreateHloProfilePrinterData( // down how much time each HLO took. class HloExecutionProfile { public: - using DeviceDescription = perftools::gputools::DeviceDescription; + using DeviceDescription = se::DeviceDescription; HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data, const HloProfileIndexMap* hlo_profile_index_map); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index c35783c456c63b9a651d1221cf9a3d70af38ba66..516e14b4642ae6665a2d15c91715dc9b057ab41a 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -909,6 +909,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kBitcastConvert: case HloOpcode::kCeil: case HloOpcode::kClamp: + case HloOpcode::kClz: case HloOpcode::kComplex: case HloOpcode::kConvert: case HloOpcode::kCos: @@ -956,7 +957,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kTuple: return kWhite; case HloOpcode::kBroadcast: - case HloOpcode::kBroadcastDimOne: // De-emphasize nodes which broadcast a scalar within a fusion node -- // these are essentially free. if (instr->IsFused() && diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 56cb241087cf31084df76c25ead89d477cd38f0f..a714d0e114245021c28da26beae444dbd3d99bb5 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -254,6 +254,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, case HloOpcode::kCeil: case HloOpcode::kCopy: case HloOpcode::kCos: + case HloOpcode::kClz: case HloOpcode::kExp: case HloOpcode::kFloor: case HloOpcode::kImag: @@ -700,15 +701,6 @@ HloInstruction::CreateSelectAndScatter( return instruction; } -/* static */ std::unique_ptr -HloInstruction::CreateBroadcastDimOne(const Shape& shape, - HloInstruction* operand) { - auto instruction = - WrapUnique(new HloInstruction(HloOpcode::kBroadcastDimOne, shape)); - instruction->AppendOperand(operand); - return instruction; -} - /* static */ std::unique_ptr HloInstruction::CreateBroadcastSequence( const Shape& output_shape, HloInstruction* operand, @@ -1257,6 +1249,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kRoundNearestAfz: case HloOpcode::kBitcast: case HloOpcode::kCeil: + case HloOpcode::kClz: case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kExp: @@ -1311,10 +1304,6 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( CHECK_EQ(new_operands.size(), 1); clone = CreateBroadcast(shape, new_operands[0], dimensions_); break; - case HloOpcode::kBroadcastDimOne: - CHECK_EQ(new_operands.size(), 1); - clone = CreateBroadcastDimOne(shape, new_operands[0]); - break; case HloOpcode::kCall: clone = CreateCall(shape, new_operands, to_apply()); break; @@ -1689,14 +1678,35 @@ Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) { } Status HloInstruction::RemoveControlDependencyTo(HloInstruction* instruction) { - auto succ_it = std::find(control_successors_.begin(), - control_successors_.end(), instruction); - TF_RET_CHECK(succ_it != control_successors_.end()); - control_successors_.erase(succ_it); - auto pred_it = std::find(instruction->control_predecessors_.begin(), - instruction->control_predecessors_.end(), this); - TF_RET_CHECK(pred_it != instruction->control_predecessors_.end()); - instruction->control_predecessors_.erase(pred_it); + TF_RET_CHECK(instruction->parent() == parent()); + TF_RETURN_IF_ERROR(EraseElementFromVector(&control_successors_, instruction)); + TF_RETURN_IF_ERROR( + EraseElementFromVector(&instruction->control_predecessors_, this)); + return Status::OK(); +} + +Status HloInstruction::DropAllControlDeps() { + for (auto* ctrl_succ : control_successors_) { + TF_RETURN_IF_ERROR( + EraseElementFromVector(&ctrl_succ->control_predecessors_, this)); + } + for (auto* ctrl_pred : control_predecessors_) { + TF_RETURN_IF_ERROR( + EraseElementFromVector(&ctrl_pred->control_successors_, this)); + } + control_successors_.clear(); + control_predecessors_.clear(); + return Status::OK(); +} + +Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) { + for (auto* ctrl_pred : inst->control_predecessors()) { + TF_RETURN_IF_ERROR(ctrl_pred->AddControlDependencyTo(this)); + } + + for (auto* ctrl_succ : inst->control_successors()) { + TF_RETURN_IF_ERROR(this->AddControlDependencyTo(ctrl_succ)); + } return Status::OK(); } @@ -1741,6 +1751,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kAdd: case HloOpcode::kCeil: case HloOpcode::kClamp: + case HloOpcode::kClz: case HloOpcode::kComplex: case HloOpcode::kCopy: case HloOpcode::kCos: @@ -1863,8 +1874,6 @@ bool HloInstruction::IdenticalSlowPath( // Remaining instructions with special values. case HloOpcode::kBitcast: - case HloOpcode::kBroadcastDimOne: - case HloOpcode::kDynamicUpdateSlice: return eq_shapes(shape(), other.shape()); case HloOpcode::kBroadcast: return eq_shapes(shape(), other.shape()) && @@ -1883,6 +1892,8 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kDynamicSlice: return eq_shapes(shape(), other.shape()) && dynamic_slice_sizes_ == other.dynamic_slice_sizes_; + case HloOpcode::kDynamicUpdateSlice: + return eq_shapes(shape(), other.shape()); case HloOpcode::kCall: case HloOpcode::kMap: return eq_computations(to_apply(), other.to_apply()); @@ -2440,12 +2451,6 @@ HloInstructionProto HloInstruction::ToProto() const { proto.add_fft_length(fft_len); } - if (gather_dimension_numbers_ != nullptr) { - *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_; - } - for (int64 bound : gather_window_bounds_) { - proto.add_gather_window_bounds(bound); - } proto.set_channel_name(channel_name_); proto.set_cost_estimate_ns(cost_estimate_ns_); @@ -2672,6 +2677,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleFloor(this); case HloOpcode::kCeil: return visitor->HandleCeil(this); + case HloOpcode::kClz: + return visitor->HandleClz(this); case HloOpcode::kLog: return visitor->HandleLog(this); case HloOpcode::kTanh: @@ -2692,8 +2699,6 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleBitcast(this); case HloOpcode::kBroadcast: return visitor->HandleBroadcast(this); - case HloOpcode::kBroadcastDimOne: - return visitor->HandleBroadcastDimOne(this); case HloOpcode::kPad: return visitor->HandlePad(this); case HloOpcode::kReshape: @@ -3015,6 +3020,7 @@ bool HloInstruction::IsElementwise() const { case HloOpcode::kAbs: case HloOpcode::kRoundNearestAfz: case HloOpcode::kCeil: + case HloOpcode::kClz: case HloOpcode::kConvert: case HloOpcode::kBitcastConvert: case HloOpcode::kCopy: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 49aa07502996b698bb20f2c2e9d1d371d43d1793..a5e9aecb9e7f5204b53186abca78033215a75828 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -401,10 +401,6 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, tensorflow::gtl::ArraySlice broadcast_dimensions); - // Creates a broadcast-size-one-dimensions instruction. - static std::unique_ptr CreateBroadcastDimOne( - const Shape& shape, HloInstruction* operand); - // Creates a sequence of instructions that performs an explicit broadcast of // the operand to the target shape. // @@ -561,6 +557,18 @@ class HloInstruction { // 'instruction'. Status RemoveControlDependencyTo(HloInstruction* instruction); + // Drops all control predecessors and successors from this HLO instruction. + Status DropAllControlDeps(); + + // Copies the control predecessors and successors on this HLO instruction to + // `inst`. Does not do a deep copy so this makes sense only if `inst` and + // this HLO are in the same module. + // + // Depending on the use cases we see in practice, in the future we may + // consider folding the logic here into Clone, CloneWithNewOperands and + // ReplaceAllUsesWith by treating control dependencies like data dependencies. + Status CopyAllControlDepsFrom(const HloInstruction* inst); + // Returns the set of control predecessors (successors) of this // instruction. Control predecessors (successors) must execute before (after) // the current instruction. @@ -1152,17 +1160,17 @@ class HloInstruction { // Clones the HLO instruction. The clone will have the same opcode, shape, and // operands. After creation the clone has no uses. "this" (the instruction // cloned from) is not changed. Suffix is the string to append to the name of - // the instruction to form the name of the cloned instruction. - // If the module pointer is not nullptr, it will be the module where - // the cloned computations will be added to (in order to support deep - // cloning). + // the instruction to form the name of the cloned instruction. If the module + // pointer is not nullptr, it will be the module where the cloned computations + // will be added to (in order to support deep cloning). Ignores the control + // predecessors and successors of this HLO instruction. std::unique_ptr Clone(const string& suffix = "clone", HloModule* module = nullptr) const; - // Clones the HLO instruction as above but with new shape and operands. - // If the module pointer is not nullptr, it will be the module where - // the cloned computations will be added to (in order to support deep - // cloning). + // Clones the HLO instruction as above but with new shape and operands. If + // the module pointer is not nullptr, it will be the module where the cloned + // computations will be added to (in order to support deep cloning). Ignores + // the control predecessors and successors of this HLO instruction. std::unique_ptr CloneWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands, HloModule* module = nullptr) const; diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index bc74c4bc10cad20eab20b5caf8550b17048a5276..69deac263ee58f9e4d46987a54f09b11d650950a 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -132,6 +132,69 @@ bool HloCustomCallMatcher::MatchAndExplain( return result; } +bool HloShapeMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (ShapeUtil::Compatible(instruction->shape(), shape_)) { + return true; + } + *listener << instruction->ToString() << " has incorrect shape (expected: " + << ShapeUtil::HumanString(shape_) << ")"; + return false; +} + +void HloShapeMatcher::DescribeTo(std::ostream* os) const { + *os << ShapeUtil::HumanString(shape_); +} + +bool HloShapeAndLayoutMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (ShapeUtil::Equal(instruction->shape(), shape_)) { + return true; + } + *listener << instruction->ToString() << " has incorrect shape (expected: " + << ShapeUtil::HumanStringWithLayout(shape_) << ")"; + return false; +} + +void HloShapeAndLayoutMatcher::DescribeTo(std::ostream* os) const { + *os << ShapeUtil::HumanStringWithLayout(shape_); +} + +bool HloShardingMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (!sharding_.has_value()) { + if (!instruction->has_sharding()) { + return true; + } + *listener << instruction->ToString() << " expected to have no sharding."; + return false; + } + if (instruction->has_sharding()) { + if (instruction->sharding() == sharding_.value()) { + return true; + } + *listener << instruction->ToString() + << " has incorrect sharding (expected: " << sharding_->ToString() + << ")"; + return false; + } else { + *listener << instruction->ToString() + << " has no sharding (expected: " << sharding_->ToString() << ")"; + return false; + } +} + +void HloShardingMatcher::DescribeTo(std::ostream* os) const { + if (sharding_.has_value()) { + *os << sharding_->ToString(); + } else { + *os << ""; + } +} + } // namespace testing void PrintTo(const HloInstruction* inst, ::std::ostream* os) { diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 103f04a2cb7a1a5ae877d8bf259692f7cbed3408..f2ab9b5d9b6e000cdd808fe06a70f0506ddb5ff3 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/lib/gtl/optional.h" namespace xla { namespace testing { @@ -86,6 +87,50 @@ class HloCustomCallMatcher : public HloMatcher { ::testing::Matcher call_target_matcher_; }; +class HloShapeMatcher + : public ::testing::MatcherInterface { + public: + explicit HloShapeMatcher(const Shape& shape) : shape_(shape) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + Shape shape_; +}; + +class HloShapeAndLayoutMatcher + : public ::testing::MatcherInterface { + public: + explicit HloShapeAndLayoutMatcher(const Shape& shape) : shape_(shape) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + Shape shape_; +}; + +// Verify the sharding of an instruction against the provided HloSharding. If a +// nullopt is provided for the expected sharding then it checks that no sharding +// is present for an instruction. +class HloShardingMatcher + : public ::testing::MatcherInterface { + public: + explicit HloShardingMatcher( + const tensorflow::gtl::optional& sharding) + : sharding_(sharding) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + tensorflow::gtl::optional sharding_; +}; + // HloInstruction* matchers for opcode and operands. Example: // namespace op = xla::opcode_matchers; // EXPECT_THAT(instruction, @@ -231,6 +276,30 @@ inline ::testing::Matcher CustomCall() { new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {})); } +// Verifies the shape or the shape and the layout of an HLO instruction against +// the provided shape object. +inline ::testing::Matcher Shape( + const class Shape& shape) { + return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape)); +} +inline ::testing::Matcher ShapeWithLayout( + const class Shape& shape) { + return ::testing::MakeMatcher( + new ::xla::testing::HloShapeAndLayoutMatcher(shape)); +} + +// Verifies the value of the HloSharing against the provided sharding object. +inline ::testing::Matcher Sharding( + const HloSharding& sharding) { + return ::testing::MakeMatcher( + new ::xla::testing::HloShardingMatcher(sharding)); +} +// Verifies that no HloSharding is set for an HLO instruction. +inline ::testing::Matcher NoSharding() { + return ::testing::MakeMatcher( + new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt)); +} + #undef HLO_MATCHER } // namespace opcode_matchers diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index 1c21703a45e11914854153bc14fabd85e9ea57f2..c6373b2e46af7df1c7cfe03f95e155f519f39a11 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -100,5 +100,63 @@ TEST(HloMatchersTest, CustomCallMatcher) { R"(custom-call with call target that is equal to "foo_target")"); } +TEST(HloMatchersTest, ShapeMatcher) { + auto p0 = HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}), "param"); + + EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {5, 7}))); + EXPECT_THAT( + p0.get(), + ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {5, 7})))); + EXPECT_THAT(p0.get(), + ::testing::Not(op::Shape(ShapeUtil::MakeShape(F32, {7, 5})))); + EXPECT_THAT( + p0.get(), + ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {7, 5})))); + EXPECT_THAT(p0.get(), + op::Shape(ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}))); + EXPECT_THAT(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout( + F32, {5, 7}, {0, 1}))); + EXPECT_THAT(p0.get(), + ::testing::Not(op::ShapeWithLayout( + ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {1, 0})))); + + EXPECT_THAT(Explain(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))), + "%param = f32[5,7]{0,1} parameter(0) has incorrect shape " + "(expected: f32[7,5])"); + EXPECT_THAT( + Explain(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout( + F32, {7, 5}, {1, 0}))), + "%param = f32[5,7]{0,1} parameter(0) has incorrect shape " + "(expected: f32[7,5]{1,0})"); +} + +TEST(HloMatchersTest, ShardingMatcher) { + auto p0 = HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {5}), + "param.0"); + p0->clear_sharding(); + auto p1 = HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {7}), + "param.1"); + p1->set_sharding(HloSharding::AssignDevice(1)); + + EXPECT_THAT(p0.get(), op::NoSharding()); + EXPECT_THAT(p0.get(), + ::testing::Not(op::Sharding(HloSharding::AssignDevice(1)))); + EXPECT_THAT(p1.get(), ::testing::Not(op::NoSharding())); + EXPECT_THAT(p1.get(), + ::testing::Not(op::Sharding(HloSharding::AssignDevice(0)))); + EXPECT_THAT(p1.get(), op::Sharding(HloSharding::AssignDevice(1))); + + EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))), + "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: " + "{maximal device=1})"); + EXPECT_THAT(Explain(p1.get(), op::NoSharding()), + "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} " + "expected to have no sharding."); + EXPECT_THAT(Explain(p1.get(), op::Sharding(HloSharding::AssignDevice(0))), + "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} " + "has incorrect sharding (expected: {maximal device=0})"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index dddc72480f93c4c3cc29f41db99fa773dc8d6b68..ca763076a16af1150a8623fb7dbf22c46a5ca263 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -54,10 +54,10 @@ namespace xla { V(kBitcast, "bitcast") \ V(kBitcastConvert, "bitcast-convert") \ V(kBroadcast, "broadcast") \ - V(kBroadcastDimOne, "broadcast-dim-one") \ V(kCall, "call", kHloOpcodeIsVariadic) \ V(kCeil, "ceil") \ V(kClamp, "clamp") \ + V(kClz, "count-leading-zeros") \ V(kComplex, "complex") \ V(kConcatenate, "concatenate", kHloOpcodeIsVariadic) \ V(kConditional, "conditional") \ diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 2e834a79d9f63154172798d252be938d0d475c01..81c43db292a75dfd1e882537052b75d64a55002e 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -30,8 +30,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { /*static*/ StatusOr> @@ -109,33 +107,31 @@ StatusOr> HloRunner::Execute( const ExecutableRunOptions& run_options = service_run_options.run_options(); // Copy arguments to device. - std::vector> argument_buffers; - std::vector argument_buffer_ptrs; + std::vector argument_buffers; for (Literal* argument : arguments) { TF_ASSIGN_OR_RETURN( - std::unique_ptr argument_buffer, + ScopedShapedBuffer argument_buffer, backend().transfer_manager()->AllocateScopedShapedBuffer( argument->shape(), run_options.allocator(), run_options.device_ordinal())); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - stream.parent(), *argument, *argument_buffer)); + stream.parent(), *argument, argument_buffer)); argument_buffers.push_back(std::move(argument_buffer)); - argument_buffer_ptrs.push_back(argument_buffers.back().get()); + } + + std::vector argument_buffer_ptrs; + argument_buffer_ptrs.reserve(argument_buffers.size()); + for (const auto& buf : argument_buffers) { + argument_buffer_ptrs.push_back(&buf); } TF_ASSIGN_OR_RETURN( - std::unique_ptr result, + ScopedShapedBuffer result, executable->ExecuteOnStreamWrapper( &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs)); - // Create a ScopedShapedBuffer of the result to manage deallocation. This will - // deallocate all the device memory when it goes out of scope. - TF_ASSIGN_OR_RETURN( - std::unique_ptr scoped_result, - ScopedShapedBuffer::MakeScoped(result.get(), run_options.allocator())); - auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice( - stream.parent(), *scoped_result); + stream.parent(), result); if (result_literal.ok()) { VLOG(4) << "Executed binary and got result: " << result_literal.ValueOrDie()->ToString(); @@ -157,7 +153,13 @@ StatusOr>> HloRunner::ExecuteReplicated( backend().computation_placer()->AssignDevices(options.num_replicas, 1)); std::vector> streams; std::vector service_run_options; - std::vector> argument_buffers; + + std::vector argument_buffers; + // This reserve() call is necessary for correctness, because + // argument_buffer_ptrs contains pointers into the elements of + // argument_buffers. + argument_buffers.reserve(options.num_replicas * options.arguments.size()); + // Plus one so we can safely get &argument_buffer_ptrs[0] in case there are // no arguments. std::vector argument_buffer_ptrs( @@ -177,13 +179,13 @@ StatusOr>> HloRunner::ExecuteReplicated( // Copy arguments to device. for (const Literal* argument : options.arguments) { TF_ASSIGN_OR_RETURN( - std::unique_ptr argument_buffer, + ScopedShapedBuffer argument_buffer, backend().transfer_manager()->AllocateScopedShapedBuffer( argument->shape(), backend().memory_allocator(), device)); TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - executor, *argument, *argument_buffer)); + executor, *argument, argument_buffer)); argument_buffers.push_back(std::move(argument_buffer)); - argument_buffer_ptrs[index++] = argument_buffers.back().get(); + argument_buffer_ptrs[index++] = &argument_buffers.back(); } argument_buffer_slices.emplace_back( &argument_buffer_ptrs[index - options.arguments.size()], @@ -242,19 +244,16 @@ StatusOr>> HloRunner::ExecuteReplicated( } LOG(INFO) << "Replicated execution started"; - TF_ASSIGN_OR_RETURN(std::vector> results, + TF_ASSIGN_OR_RETURN(std::vector results, executable->ExecuteOnStreams(service_run_options, argument_buffer_slices)); LOG(INFO) << "Replicated execution terminated"; std::vector> exec_results; for (int64 i = 0; i < options.num_replicas; ++i) { - TF_ASSIGN_OR_RETURN(std::unique_ptr result, - ScopedShapedBuffer::MakeScoped( - results[i].get(), backend().memory_allocator())); TF_ASSIGN_OR_RETURN(std::unique_ptr literal, backend().transfer_manager()->TransferLiteralFromDevice( - streams[i]->parent(), *result)); + streams[i]->parent(), results[i])); exec_results.push_back(std::move(literal)); } return std::move(exec_results); diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index f54fb44766eb07f402b2946abc83d50d155e47c1..53f7c6fe4a09111c5ee24f2290f0f4aeed0a4401 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -80,7 +80,7 @@ class HloRunner { bool run_hlo_passes = false; }; - explicit HloRunner(::perftools::gputools::Platform* platform); + explicit HloRunner(se::Platform* platform); ~HloRunner(); @@ -149,8 +149,7 @@ class HloRunner { // will be used to configure the replication parameters. Replicated executions // should pass the device_assignment parameter. ServiceExecutableRunOptions GetServiceRunOptionsForDevice( - int64 device, ::perftools::gputools::Stream* stream, - DeviceAssignment* device_assignment); + int64 device, se::Stream* stream, DeviceAssignment* device_assignment); std::unique_ptr backend_; }; diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 1b42349b0b3ad9634bb910b3843affed6a0ca334..994de441237493b5e2254a0a66763d6195c5ea85 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -256,37 +256,24 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, ", input_shape=", ShapeUtil::HumanString(shape)); } - // The tile shape must not be the same as the input shape without maximal_ - // also set. If this is the case, we're not actually sharded and the correct - // constructor should have been used. - if (ShapeUtil::Equal(shape, tile_shape_)) { + // The correct constructor have to be used to create tile maximal shardings. + if (tile_assignment_.num_elements() == 1) { return tensorflow::errors::InvalidArgument( - "Tile shape is the same as the input shape. If a replicated sharding " - "was intended, use HloSharding::Replicated(). If a device placement " - "was intended, use HloSharding::AssignDevice()"); + "Tile assignment only contains a single device. If a replicated " + "sharding was intended, use HloSharding::Replicated(). If a device " + "placement was intended, use HloSharding::AssignDevice()"); } - // The tile shape must not be greater than the input shape in any dimension. - for (int64 i = 0, e = ShapeUtil::Rank(shape); i != e; ++i) { - auto tile_dim = tile_shape_.dimensions(i); - auto shape_dim = shape.dimensions(i); - if (tile_dim > shape_dim) { - return tensorflow::errors::InvalidArgument( - StrCat("Tile is larger than input shape (dimension ", i, ", ", - tile_dim, " > ", shape_dim)); - } - } - - // The tile assignment tensor must be exactly dimensioned to ceil(shape[dim] - // tile[dim]) for every dimension contained within tile. + // The tile assignment tensor must contain enough element to cover the full + // shape with tiles of the specified size. for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) { - int64 expected_dim = - CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i)); - if (tile_assignment_.dimensions()[i] != expected_dim) { + int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i); + if (shape.dimensions(i) > total_tile_size) { return tensorflow::errors::InvalidArgument( - StrCat("Tile assignment tensor has incorrect shape. Dimension ", i, - " expected ", expected_dim, " but got ", - tile_assignment_.dimensions()[i])); + StrCat("Tile assignment tensor has too few element to cover the full " + "shape. Dimension ", + i, ", shape ", shape.dimensions(i), ", total size ", + total_tile_size)); } } diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc index 69ea4233e45c2e59c8d1541a0517a007f4bbf42f..3bf0d25efb7fad78aeccdd9269c289950b2171ab 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc @@ -88,7 +88,7 @@ TEST_F(HloShardingTest, Tile) { } { - // Test should pass. + // Test should fail because of more devices used then `num_device`. Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); HloSharding sharding = HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); @@ -97,17 +97,8 @@ TEST_F(HloShardingTest, Tile) { } { - // Test should fail due to the tile being larger than the input space. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); - EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {2, 2}), - /*num_devices=*/4)); - } - - { - // Test should fail due to the tile not dividing the input space into 4 - // sections (even with padding). + // Test should fail because the total tiled size in dimension 0 is 4 but we + // have 6 elements along that dimensions. Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); HloSharding sharding = HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 63ec5964eb935239e86233c1ae94e2bcce3b0461..8a30cbf9cd622ffb64d345ddaf0dc88f34850bfc 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/errors.h" @@ -174,34 +175,17 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) { TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape())); TF_RET_CHECK(ShapeUtil::Rank(operand_shape) == broadcast->dimensions().size()); - for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) { - int64 output_dimension = broadcast->dimensions()[i]; + for (int64 operand_dimension = 0; + operand_dimension < ShapeUtil::Rank(operand_shape); + ++operand_dimension) { + int64 output_dimension = broadcast->dimensions()[operand_dimension]; TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) == - operand_shape.dimensions(i)) + operand_shape.dimensions(operand_dimension)) << broadcast->ToString() << " operand shape " << operand_shape; } return tensorflow::Status::OK(); } -Status ShapeVerifier::HandleBroadcastDimOne(HloInstruction* broadcastDimOne) { - const Shape& operand_shape = broadcastDimOne->operand(0)->shape(); - int64 operand_rank = ShapeUtil::Rank(operand_shape); - const Shape& output_shape = broadcastDimOne->shape(); - // Check for mixed precision. - TF_RETURN_IF_ERROR(CheckShape(broadcastDimOne, output_shape)); - TF_RET_CHECK(operand_rank == ShapeUtil::Rank(output_shape)); - for (int64 i = 0; i < operand_rank; ++i) { - int64 operand_dimension = operand_shape.dimensions(i); - int64 output_dimension = output_shape.dimensions(i); - TF_RET_CHECK(operand_dimension == 1 || - operand_dimension == output_dimension) - << "Dimension " << i << " of broadcastDimOne " - << broadcastDimOne->ToString() << " is " << operand_dimension - << ", expected 1 or " << output_dimension; - } - return tensorflow::Status::OK(); -} - Status ShapeVerifier::HandleReshape(HloInstruction* reshape) { // Check for mixed precision. TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape())); @@ -748,6 +732,73 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { return tensorflow::Status::OK(); } +Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { + auto* while_cond = instruction->while_condition(); + auto* while_body = instruction->while_body(); + if (while_cond->num_parameters() != 1) { + return FailedPrecondition( + "While condition must have exactly 1 parameter; had %lld : %s", + while_cond->num_parameters(), while_cond->ToString().c_str()); + } + if (while_body->num_parameters() != 1) { + return FailedPrecondition( + "While body must have exactly 1 parameter; had %lld : %s", + while_body->num_parameters(), while_body->ToString().c_str()); + } + if (instruction->operand_count() != 1) { + return FailedPrecondition( + "While loop must have exactly one operand; had %lld : %s", + instruction->operand_count(), instruction->ToString().c_str()); + } + auto* init = instruction->operand(0); + auto* cond_param = while_cond->parameter_instruction(0); + if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) { + return FailedPrecondition( + "While condition's parameter must have the same shape as the " + "loop's 'init'. init: %s, param: %s", + init->ToString().c_str(), cond_param->ToString().c_str()); + } + auto* cond_root = while_cond->root_instruction(); + if (!ShapeUtil::Compatible(cond_root->shape(), + ShapeUtil::MakeShape(PRED, {}))) { + return FailedPrecondition("While condition should have shape PRED: %s", + cond_root->ToString().c_str()); + } + auto* body_param = while_body->parameter_instruction(0); + if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) { + return FailedPrecondition( + "While body's parameter must have the same shape as the loop's" + " 'init'. init: %s, param: %s", + init->ToString().c_str(), body_param->ToString().c_str()); + } + auto* body_root = while_body->root_instruction(); + if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) { + return FailedPrecondition( + "While body should have same shape as the loop's 'init'." + "init: %s, body: %s", + init->ToString().c_str(), body_root->ToString().c_str()); + } + return tensorflow::Status::OK(); +} + +Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { + const Shape& out_shape = instruction->shape(); + for (HloInstruction* operand : instruction->operands()) { + const Shape& operand_shape = operand->shape(); + if (!ShapeUtil::IsScalar(operand_shape) && + !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) { + return FailedPrecondition( + "Implicit broadcast is not allowed in HLO." + "Found non-compatible shapes for instruction %s.\n" + "output: %s\noperand: %s\n", + HloOpcodeString(instruction->opcode()).c_str(), + ShapeUtil::HumanString(out_shape).c_str(), + ShapeUtil::HumanString(operand_shape).c_str()); + } + } + return tensorflow::Status::OK(); +} + StatusOr HloVerifier::Run(HloModule* module) { TF_RETURN_IF_ERROR(VerifyHloStructure(module)); @@ -788,39 +839,9 @@ StatusOr HloVerifier::Run(HloModule* module) { << instruction->dimensions().size() << " != " << ShapeUtil::Rank(instruction->operand(0)->shape()); } else if (instruction->opcode() == HloOpcode::kWhile) { - auto* while_cond = instruction->while_condition(); - auto* while_body = instruction->while_body(); - TF_RET_CHECK(while_cond->num_parameters() == 1) - << "While condition must have exactly 1 parameter; had " - << while_cond->num_parameters() << ": " << while_cond->ToString(); - TF_RET_CHECK(while_body->num_parameters() == 1) - << "While body must have exactly 1 parameter; had " - << while_body->num_parameters() << ": " << while_body->ToString(); - TF_RET_CHECK(instruction->operand_count() == 1) - << "While loop must have exactly one operand; had " - << instruction->operand_count() << ": " << instruction->ToString(); - - auto* init = instruction->operand(0); - auto* cond_param = while_cond->parameter_instruction(0); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), cond_param->shape())) - << "While condition's parameter must have the same shape as the " - "loop's 'init'. init: " - << init->ToString() << ", param: " << cond_param->ToString(); - auto* cond_root = while_cond->root_instruction(); - TF_RET_CHECK(ShapeUtil::Compatible(cond_root->shape(), - ShapeUtil::MakeShape(PRED, {}))) - << "While condition should have shape PRED: " - << cond_root->ToString(); - - auto* body_param = while_body->parameter_instruction(0); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_param->shape())) - << "While body's parameter must have the same shape as the loop's " - "'init'. init: " - << init->ToString() << ", param: " << body_param->ToString(); - auto* body_root = while_body->root_instruction(); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_root->shape())) - << "While body should have same shape as the loop's 'init'. init: " - << init->ToString() << ", body: " << body_root->ToString(); + TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction)); + } else if (instruction->IsElementwise()) { + TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction)); } auto previous = instructions.find(instruction->name()); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index a4dff977ba268137d8ab94c576b4b511e911806f..6208887547a14d22b512ef308dd2668af2f4468d 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -54,7 +54,6 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleReduce(HloInstruction* reduce) override; Status HandleBitcast(HloInstruction* bitcast) override; Status HandleBroadcast(HloInstruction* broadcast) override; - Status HandleBroadcastDimOne(HloInstruction* broadcastDimOne) override; Status HandleReshape(HloInstruction* reshape) override; Status HandleTranspose(HloInstruction* transpose) override; Status HandleParameter(HloInstruction*) override; @@ -103,7 +102,7 @@ class ShapeVerifier : public DfsHloVisitor { Status CheckTernaryShape(const HloInstruction* instruction); Status CheckVariadicShape(const HloInstruction* instruction); - // Checks if the given two instructions shares the same channel id. + // Checks if the given two instructions share the same channel id. Status CheckSameChannel(const HloInstruction* instr1, const HloInstruction* instr2); @@ -145,9 +144,15 @@ class HloVerifier : public HloPassInterface { // CHECKs various invariants of a fusion instruction. Status CheckFusionInstruction(HloInstruction* fusion) const; + Status CheckWhileInstruction(HloInstruction* instruction); + + // Checks that the non-scalar operand shapes are compatible to the output + // shape, i.e., that there are no implicit broadcasts of size-one dimensions. + Status CheckElementwiseInstruction(HloInstruction* instruction); + // Creates a ShapeVerifier that checks that shapes match inferred - // expectations. This is a factory function because ShapeVerifier, Note that - // ShapeVerifier, being a DfsHloVisitor, is stateful. We want a clean object + // expectations. This is a factory function because ShapeVerifier, + // being a DfsHloVisitor, is stateful. We want a clean object // for each run of the verifier. ShapeVerifierFactory shape_verifier_factory_; }; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 3f4dbf897df7e1fd62f4229ed90c949c59da9d46..b9ccfeddb565b7b44f5b38281a49df6cd0fdc766 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -37,9 +37,9 @@ namespace xla { case HloOpcode::kBitcast: case HloOpcode::kBitcastConvert: case HloOpcode::kBroadcast: - case HloOpcode::kBroadcastDimOne: case HloOpcode::kCeil: case HloOpcode::kClamp: + case HloOpcode::kClz: case HloOpcode::kComplex: case HloOpcode::kConcatenate: case HloOpcode::kConstant: @@ -143,8 +143,7 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) { }); return std::count_if(hlo->operands().begin(), hlo->operands().end(), [output_rank](HloInstruction* operand) { - if (operand->opcode() == HloOpcode::kBroadcast || - operand->opcode() == HloOpcode::kBroadcastDimOne) { + if (operand->opcode() == HloOpcode::kBroadcast) { return false; } if (operand->opcode() == HloOpcode::kConstant && @@ -249,8 +248,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { auto reachability = computation->ComputeReachability(); auto cheap_to_duplicate = [this](HloInstruction* producer) { - if (producer->opcode() == HloOpcode::kBroadcast || - producer->opcode() == HloOpcode::kBroadcastDimOne) { + if (producer->opcode() == HloOpcode::kBroadcast) { return true; } if (producer->opcode() == HloOpcode::kConstant && diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index 5b9bf5faf366d674ecadd59fa8a0af8d4976a962..76b3ecad26fe92e910fd3fe0e405c726da7e14b7 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -41,9 +41,6 @@ limitations under the License. namespace xla { namespace interpreter { -namespace se = ::perftools::gputools; -namespace sep = ::perftools::gputools::interpreter; - Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { HloPassPipeline pipeline("Interpreter"); @@ -96,7 +93,7 @@ InterpreterCompiler::CompileAheadOfTime( } se::Platform::Id InterpreterCompiler::PlatformId() const { - return sep::kXlaInterpreterPlatformId; + return se::interpreter::kXlaInterpreterPlatformId; } HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction() @@ -109,11 +106,12 @@ static std::unique_ptr CreateComputationPlacer() { } static bool InitModule() { - xla::Compiler::RegisterCompilerFactory(sep::kXlaInterpreterPlatformId, []() { - return xla::MakeUnique(); - }); + xla::Compiler::RegisterCompilerFactory( + se::interpreter::kXlaInterpreterPlatformId, []() { + return xla::MakeUnique(); + }); xla::ComputationPlacer::RegisterComputationPlacer( - sep::kXlaInterpreterPlatformId, &CreateComputationPlacer); + se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index c8660c04d86a82e7dfcfd1658310c2a0e4fa0083..e90ae3e818522e6e4fd9d9f5acb846800bc899ca 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -44,19 +44,16 @@ class InterpreterCompiler : public Compiler { ~InterpreterCompiler() override {} StatusOr> RunHloPasses( - std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( - std::unique_ptr hlo_module, - perftools::gputools::StreamExecutor* stream_exec, + std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> Compile( std::vector> hlo_modules, - std::vector> - stream_exec, + std::vector> stream_exec, DeviceMemoryAllocator* device_allocator) override; StatusOr>> @@ -65,7 +62,7 @@ class InterpreterCompiler : public Compiler { HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override; - perftools::gputools::Platform::Id PlatformId() const override; + se::Platform::Id PlatformId() const override; private: Status RunHloOptimization(HloModule* hlo_module); diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 883063d0f075f5b0d79edc01bcd27a7c579272f4..61f199bc9e8f4f95a2f097af4abf9395a1e05f64 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -38,8 +38,6 @@ limitations under the License. namespace xla { namespace interpreter { -namespace se = ::perftools::gputools; - InterpreterExecutable::InterpreterExecutable( std::unique_ptr hlo_module) : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, @@ -47,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable( InterpreterExecutable::~InterpreterExecutable() {} -StatusOr> InterpreterExecutable::ExecuteOnStream( +StatusOr InterpreterExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -90,12 +88,12 @@ StatusOr> InterpreterExecutable::ExecuteOnStream( evaluator.Evaluate>(*computation, arg_literals)); // Transform the result literal back into a ShapedBuffer. - TF_ASSIGN_OR_RETURN(std::unique_ptr result, - transfer_manager->AllocateShapedBuffer( + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, + transfer_manager->AllocateScopedShapedBuffer( result_literal->shape(), run_options->allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( - executor, *result_literal, *result)); + executor, *result_literal, result)); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -108,8 +106,7 @@ StatusOr> InterpreterExecutable::ExecuteOnStream( return std::move(result); } -StatusOr> -InterpreterExecutable::ExecuteAsyncOnStream( +StatusOr InterpreterExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { return tensorflow::errors::Unimplemented( diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index 410110a1adf04c83001c38ed03f5d60dd203dc7e..b0b797ca7d6f449a11c662ffba7c2a0a0040e47e 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable { InterpreterExecutable(std::unique_ptr hlo_module); ~InterpreterExecutable() override; - StatusOr> ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr> ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc index 3caf9e7b82b21a84197ffe60267d6d953f9547a1..97e9fa2c8e8ecd918ffe3df2fd4e731f3b91e6db 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.cc +++ b/tensorflow/compiler/xla/service/interpreter/executor.cc @@ -19,8 +19,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { host::HostStream *AsExecutorStream(Stream *stream) { @@ -119,5 +118,4 @@ DeviceDescription *XlaInterpreterExecutor::PopulateDeviceDescription() const { } } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h index 77426b0820d2d4e6a3a3216025837de7fa5e5c65..9b109022fbfc698f7dadc678ef837da270a5e74a 100644 --- a/tensorflow/compiler/xla/service/interpreter/executor.h +++ b/tensorflow/compiler/xla/service/interpreter/executor.h @@ -44,8 +44,7 @@ limitations under the License. #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/timer.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { using Args = tensorflow::gtl::ArraySlice; @@ -213,7 +212,6 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface { }; } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc index 3cf8506d1c469d7745d26834a51b4ce0eebaa942..d27cd7502f10a1f615fc5b0d610acafdf55e3e43 100644 --- a/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/interpreter/interpreter_transfer_manager.cc @@ -21,12 +21,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" -namespace sei = ::perftools::gputools::interpreter; - namespace xla { InterpreterTransferManager::InterpreterTransferManager() - : GenericTransferManager(sei::kXlaInterpreterPlatformId, + : GenericTransferManager(se::interpreter::kXlaInterpreterPlatformId, /*pointer_size=*/sizeof(void*)) {} } // namespace xla @@ -38,7 +36,8 @@ CreateInterpreterTransferManager() { static bool InitModule() { xla::TransferManager::RegisterTransferManager( - sei::kXlaInterpreterPlatformId, &CreateInterpreterTransferManager); + stream_executor::interpreter::kXlaInterpreterPlatformId, + &CreateInterpreterTransferManager); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index 015e00e1e8edc5c77066b6038f98621862af5440..92e069a8c67c1d441ba9d396dee503c9b3bde0df 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -28,11 +28,7 @@ limitations under the License. #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" -namespace se = ::perftools::gputools; -namespace sep = ::perftools::gputools::interpreter; - -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {} @@ -75,8 +71,8 @@ port::StatusOr XlaInterpreterPlatform::GetExecutor( port::StatusOr> XlaInterpreterPlatform::GetUncachedExecutor( const StreamExecutorConfig& config) { - auto executor = port::MakeUnique( - this, port::MakeUnique(config.plugin_config)); + auto executor = MakeUnique( + this, MakeUnique(config.plugin_config)); auto init_status = executor->Init(config.ordinal, config.device_options); if (!init_status.ok()) { return port::Status{ @@ -99,16 +95,16 @@ void XlaInterpreterPlatform::UnregisterTraceListener(TraceListener* listener) { } static void InitializeXlaInterpreterPlatform() { - std::unique_ptr platform(new sep::XlaInterpreterPlatform); - SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform))); + std::unique_ptr platform(new XlaInterpreterPlatform); + SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform))); } } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor -REGISTER_MODULE_INITIALIZER(interpreter_platform, - sep::InitializeXlaInterpreterPlatform()); +REGISTER_MODULE_INITIALIZER( + interpreter_platform, + stream_executor::interpreter::InitializeXlaInterpreterPlatform()); DECLARE_MODULE_INITIALIZER(multi_platform_manager); diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h index 2f71b29be4401a8374cdd0bad5830a632305fc26..d68c5aa20dda7ac246ed4aa667851e385a604c04 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.h +++ b/tensorflow/compiler/xla/service/interpreter/platform.h @@ -23,8 +23,7 @@ limitations under the License. #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/trace_listener.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { class XlaInterpreterPlatform : public Platform { @@ -64,7 +63,6 @@ class XlaInterpreterPlatform : public Platform { }; } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_ diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.cc b/tensorflow/compiler/xla/service/interpreter/platform_id.cc index b7fb365b70db7235764435305085e36869cbb13a..3272396ce5045129a7689a160ec859d11fbbe9fa 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform_id.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform_id.cc @@ -14,12 +14,10 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/interpreter/platform_id.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { PLATFORM_DEFINE_ID(kXlaInterpreterPlatformId); } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor diff --git a/tensorflow/compiler/xla/service/interpreter/platform_id.h b/tensorflow/compiler/xla/service/interpreter/platform_id.h index 292f958449b52ff2f522bd31f115079b4f7e0835..a6cc10bcc1eb756a3146d4a834efa4cd3ceb2d27 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform_id.h +++ b/tensorflow/compiler/xla/service/interpreter/platform_id.h @@ -18,14 +18,12 @@ limitations under the License. #include "tensorflow/stream_executor/platform.h" -namespace perftools { -namespace gputools { +namespace stream_executor { namespace interpreter { extern const Platform::Id kXlaInterpreterPlatformId; } // namespace interpreter -} // namespace gputools -} // namespace perftools +} // namespace stream_executor #endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_ diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 911b243fe28a5baf8a4b8ed752b892265f5388ac..b17c9d504501a907e27d5152e0082799e87443c7 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -23,7 +23,7 @@ limitations under the License. namespace xla { StatusOr>> LLVMCompiler::Compile( std::vector> modules, - std::vector> stream_execs, + std::vector> stream_execs, DeviceMemoryAllocator* device_allocator) { // Tensorflow tries to enable the following behaviors in all its threads: // diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index d74e81bb7f622ac5e89203a3d02ca5ad839da07e..f1c623508c5307f2b1c036d3ec6823b75c7eda13 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -60,19 +60,18 @@ class LLVMCompiler : public Compiler { // Bring in // StatusOr> RunBackend( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec, + // se::StreamExecutor* stream_exec, // DeviceMemoryAllocator* device_allocator) // StatusOr> RunHloPasses( // std::unique_ptr module, - // perftools::gputools::StreamExecutor* stream_exec, + // se::StreamExecutor* stream_exec, // DeviceMemoryAllocator* device_allocator) using Compiler::RunBackend; using Compiler::RunHloPasses; StatusOr>> Compile( std::vector> modules, - std::vector> - stream_execs, + std::vector> stream_execs, DeviceMemoryAllocator* device_allocator) override; protected: diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 499f280211aacd00e79b3ca0ddb3413f933b02da..0fa4061738612df76c72a18a9353f16bf6a42677 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -43,13 +43,11 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { /* static */ StatusOr> LocalService::NewService( const ServiceOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); } diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc index 7d8c05fffa4ab11d7dbf9956d2cb7ebd5bcdd3c4..f74bcb0b79355c8e69890487266cbc5f2a4500be 100644 --- a/tensorflow/compiler/xla/service/name_uniquer.cc +++ b/tensorflow/compiler/xla/service/name_uniquer.cc @@ -53,17 +53,18 @@ NameUniquer::NameUniquer(const string& separator) { } string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) { - string root = prefix.empty() ? "name" : prefix.ToString(); - root = GetSanitizedName(root); + string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString()); // Strip away numeric suffix (if any). Only recognize separator if it is in // the middle of the name. + bool has_numeric_suffix = false; + int64 numeric_suffix = 0; size_t separator_index = root.rfind(separator_); if (separator_index != string::npos && (separator_index > 0) && (separator_index < root.size() - 1)) { string after_suffix = root.substr(separator_index + 1); - int64 numeric_suffix; if (tensorflow::strings::safe_strto64(after_suffix, &numeric_suffix)) { + has_numeric_suffix = true; // Remove numeric suffix from root. root = root.substr(0, separator_index); // Update count to at least the numeric suffix value to avoid future @@ -71,11 +72,11 @@ string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) { generated_names_[root] = std::max(generated_names_[root], numeric_suffix); } } - int64* count = &(generated_names_[root]); if (*count == 0) { *count = 1; - return root; + return has_numeric_suffix ? tensorflow::strings::StrCat(root, separator_, 0) + : root; } else { tensorflow::strings::StrAppend(&root, separator_, *count); // Increment lookup under old 'root' name. diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc index 4258cf16876ab46dce6df062ab701b1b1a4a7580..2ec255558c4ed3695ec6c824458cbedac44dc297 100644 --- a/tensorflow/compiler/xla/service/name_uniquer_test.cc +++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc @@ -57,11 +57,18 @@ TEST_F(NameUniquerTest, NumericSuffixes) { EXPECT_EQ("foo.55", uniquer.GetUniqueName("foo")); EXPECT_EQ("foo.55.1", uniquer.GetUniqueName("foo.55.1")); EXPECT_EQ("foo.55.2", uniquer.GetUniqueName("foo.55.1")); - EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000")); + EXPECT_EQ("bar.0", uniquer.GetUniqueName("bar.-1000")); EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000")); EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1")); } +TEST_F(NameUniquerTest, PrefixHasSuffix) { + NameUniquer uniquer("."); + + EXPECT_EQ("foo.11.0", uniquer.GetUniqueName("foo.11.0")); + EXPECT_EQ("foo.11", uniquer.GetUniqueName("foo.11")); +} + TEST_F(NameUniquerTest, Sanitize) { NameUniquer uniquer("_"); @@ -73,7 +80,7 @@ TEST_F(NameUniquerTest, Sanitize) { EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo")); // Invalid characters will be replaced with '_'. - EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000")); + EXPECT_EQ("bar_0", uniquer.GetUniqueName("bar<-1000")); EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000")); EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1")); diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index 5d4963807721eb177400131fa16a69f32fb431ab..586f6ef7a9c4f17f69340e77be17aec2f677a791 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -532,7 +532,7 @@ class ShapePattern { ShapeType, ShapePatternLayoutImpl>> - IsDenseArray(const ::xla::Layout* layout) const { + IsDenseArray() const { return WithLayout(Layout().WithDenseFormat()); } @@ -540,7 +540,7 @@ class ShapePattern { ShapeType, ShapePatternLayoutImpl>> - IsSparseArray(const ::xla::Layout* layout) const { + IsSparseArray() const { return WithLayout(Layout().WithSparseFormat()); } @@ -879,7 +879,6 @@ XLA_UNOP_PATTERN(Abs) XLA_UNOP_PATTERN(RoundNearestAfz) XLA_UNOP_PATTERN(Bitcast) XLA_UNOP_PATTERN(Broadcast) -XLA_UNOP_PATTERN(BroadcastDimOne) XLA_UNOP_PATTERN(Ceil) XLA_UNOP_PATTERN(Copy) XLA_UNOP_PATTERN(Cos) diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc index 5291b1437afc67312382fe52bf9a66a1843b1b4c..c88157c312524fb273e6df368d2ef61d679d1d8b 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc +++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc @@ -67,6 +67,7 @@ TEST(PatternMatcherTest, ScalarShape) { EXPECT_TRUE(Match(&scalar_shape, match::Shape(&matched_shape).IsScalar())); EXPECT_EQ(matched_shape, &scalar_shape); EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsArray())); + EXPECT_TRUE(Match(&scalar_shape, match::Shape().IsDenseArray())); EXPECT_FALSE(Match(&scalar_shape, match::Shape().IsTuple())); EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithElementType(F32))); EXPECT_TRUE(Match(&scalar_shape, match::Shape().WithRank(0))); @@ -75,11 +76,13 @@ TEST(PatternMatcherTest, ScalarShape) { match::Shape().WithSubshape({0}, match::Shape()).WithElementType(F32))); } -TEST(PatternMatcherTest, ArrayShape) { +TEST(PatternMatcherTest, DenseArrayShape) { auto array_shape = ShapeUtil::MakeShape(F32, {2, 3, 4}); Shape* matched_shape; EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray())); EXPECT_EQ(matched_shape, &array_shape); + EXPECT_TRUE(Match(&array_shape, match::Shape().IsDenseArray())); + EXPECT_FALSE(Match(&array_shape, match::Shape().IsSparseArray())); EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar())); EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple())); EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32))); @@ -90,6 +93,33 @@ TEST(PatternMatcherTest, ArrayShape) { EXPECT_FALSE(Match(&array_shape, match::Shape().WithLayout( match::Layout(&matched_layout).WithSparseFormat()))); + EXPECT_TRUE(Match(&array_shape, + match::Shape().WithLayout( + match::Layout(&matched_layout).WithDenseFormat()))); + EXPECT_EQ(matched_layout, &array_shape.layout()); +} + +TEST(PatternMatcherTest, SparseArrayShape) { + auto array_shape = ShapeUtil::MakeShapeWithSparseLayout(F32, {2, 3, 4}, 10); + Shape* matched_shape; + EXPECT_TRUE(Match(&array_shape, match::Shape(&matched_shape).IsArray())); + EXPECT_EQ(matched_shape, &array_shape); + EXPECT_FALSE(Match(&array_shape, match::Shape().IsDenseArray())); + EXPECT_TRUE(Match(&array_shape, match::Shape().IsSparseArray())); + EXPECT_FALSE(Match(&array_shape, match::Shape().IsScalar())); + EXPECT_FALSE(Match(&array_shape, match::Shape().IsTuple())); + EXPECT_TRUE(Match(&array_shape, match::Shape().WithElementType(F32))); + EXPECT_TRUE(Match(&array_shape, match::Shape().WithRank(3))); + EXPECT_FALSE( + Match(&array_shape, match::Shape().WithSubshape({0}, match::Shape()))); + Layout* matched_layout; + EXPECT_FALSE(Match(&array_shape, + match::Shape().WithLayout( + match::Layout(&matched_layout).WithDenseFormat()))); + EXPECT_TRUE(Match(&array_shape, + match::Shape().WithLayout( + match::Layout(&matched_layout).WithSparseFormat()))); + EXPECT_EQ(matched_layout, &array_shape.layout()); } TEST(PatternMatcherTest, TupleShape) { diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc index aa974ee61a27de9c19e97d8a6eb48f9261ce4bd9..7c63c0acc7764d558b2151190f0fa79fac355cbf 100644 --- a/tensorflow/compiler/xla/service/platform_util.cc +++ b/tensorflow/compiler/xla/service/platform_util.cc @@ -29,8 +29,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" -namespace se = ::perftools::gputools; - namespace xla { using tensorflow::str_util::Lowercase; diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h index 69188820a70707d9c9be10b20fb7de92ad4d9873..571451ba43a81d19b70e4954e45d3447f15dcedc 100644 --- a/tensorflow/compiler/xla/service/platform_util.h +++ b/tensorflow/compiler/xla/service/platform_util.h @@ -34,29 +34,27 @@ class PlatformUtil { // // Note that, even if a platform is present with zero devices, if we *do* have // compilation support for it, it will be returned in this sequence. - static StatusOr> - GetSupportedPlatforms(); + static StatusOr> GetSupportedPlatforms(); // Convenience function which returns the default supported platform for // tests. If exactly one supported platform is present, then this platform is // the default platform. If exactly two platforms are present and one of them // is the interpreter platform, then the other platform is the default // platform. Otherwise returns an error. - static StatusOr GetDefaultPlatform(); + static StatusOr GetDefaultPlatform(); // Convenience function which returns the sole supported platform. If // exactly one supported platform is present, then this platform is the // default platform. Otherwise returns an error. - static StatusOr GetSolePlatform(); + static StatusOr GetSolePlatform(); // Returns the platform according to the given name. Returns error if there is // no such platform. - static StatusOr GetPlatform( - const string& platform_name); + static StatusOr GetPlatform(const string& platform_name); // Returns exactly one platform that does not have given name. Returns error // if there is no such platform, or there are multiple such platforms. - static StatusOr GetPlatformExceptFor( + static StatusOr GetPlatformExceptFor( const string& platform_name); // Returns a vector of StreamExecutors for the given platform. The vector is @@ -64,8 +62,8 @@ class PlatformUtil { // element is nullptr, then the device is present by not supported by XLA. // // If the platform has no visible devices, a not-found error is returned. - static StatusOr> - GetStreamExecutors(perftools::gputools::Platform* platform); + static StatusOr> GetStreamExecutors( + se::Platform* platform); private: TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil); diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc index 49ec38eb62c7b51c7a2d301d882cef032b288036..0f26a025bf125f70199637894741540f89eae7e5 100644 --- a/tensorflow/compiler/xla/service/reshape_mover.cc +++ b/tensorflow/compiler/xla/service/reshape_mover.cc @@ -155,15 +155,20 @@ HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand, case HloOpcode::kConstant: { if (first_reshape_operand->opcode() == HloOpcode::kReshape) { VLOG(5) << "Adding reshape to kConstant operand"; - return computation->AddInstruction( + HloInstruction* reshape = computation->AddInstruction( HloInstruction::CreateReshape(new_shape, operand)); + operand->SetupDerivedInstruction(reshape); + return reshape; } else { CHECK(first_reshape_operand->opcode() == HloOpcode::kTranspose); VLOG(5) << "Adding transpose to kConstant operand"; std::vector inverse_permutation = InversePermutation(first_reshape_operand->dimensions()); - return computation->AddInstruction(HloInstruction::CreateTranspose( - new_shape, operand, inverse_permutation)); + HloInstruction* transpose = + computation->AddInstruction(HloInstruction::CreateTranspose( + new_shape, operand, inverse_permutation)); + operand->SetupDerivedInstruction(transpose); + return transpose; } } case HloOpcode::kRng: { diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index 094f7319f462a71f4bfe972771a1de4aedbb8ee3..13e2d3258e3b92f52320201c382594962c0e3b2b 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -458,57 +458,6 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { EXPECT_EQ(select, computation->root_instruction()); } -// Tree looks like: -// -// param0 [1,128,1] -// | -// reshape [128,1] constant [128,1024] -// \ / -// multiply w/implicit broadcast [128,1024] -// -// The reshape mover would like to sink the reshape below the multiply. -// -// Previously we would attempt to insert a reshape of the constant to [1,128,1] -// (which is unsound, because it has a different number of elements) as -// preparation for sinking the reshape. -// -// To eliminate the unsoundness, we outlaw reshape sinking when one of the -// operands is implicitly broadcast in the elementwise consumer. -// -// TODO(b/37799338) However, it would be possible in this case to do a more -// in-depth analysis to get reshape movement to occur: -// -// 1. Note that the broadcast dimension (logical dimension 1) in the operands -// would map back to logical dimension 2 in the param0 node. -// 2. Match rank of the constant to the param0 node (by prepending a trivial 1 -// dimension). -// 3. Reshape to [128,1024] at the root. -// -// But this is not currently done. -TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) { - HloComputation::Builder builder(TestName()); - auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0")); - auto reshape = builder.AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(F32, {128, 1}), param0)); - Array2D a(128, 1024); - auto literal = Literal::CreateR2FromArray2D(a); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(std::move(literal))); - auto multiply = builder.AddInstruction(HloInstruction::CreateBinary( - constant->shape(), HloOpcode::kMultiply, constant, reshape)); - - auto computation = module().AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Constant(), op::Reshape(param0))); - - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Constant(), op::Reshape(param0))); - EXPECT_EQ(multiply, computation->root_instruction()); -} - // Tree looks like this: // // add1 diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 52500e4e79042c51d4bea17dea6845ed23433d6c..086bd61dd04aa13b6bba7898806951fedc6fbfd4 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -54,8 +54,6 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - using ::tensorflow::strings::Printf; using ::tensorflow::strings::StrCat; using ::xla::source_map_util::InvalidParameterArgument; @@ -95,15 +93,12 @@ tensorflow::Status RecordResult(const ShapedBuffer& result, } // namespace -ServiceOptions& ServiceOptions::set_platform( - perftools::gputools::Platform* platform) { +ServiceOptions& ServiceOptions::set_platform(se::Platform* platform) { platform_ = platform; return *this; } -perftools::gputools::Platform* ServiceOptions::platform() const { - return platform_; -} +se::Platform* ServiceOptions::platform() const { return platform_; } ServiceOptions& ServiceOptions::set_number_of_replicas(int number_of_replicas) { number_of_replicas_ = number_of_replicas; @@ -123,7 +118,7 @@ int ServiceOptions::intra_op_parallelism_threads() const { } /* static */ StatusOr> Service::NewService( - perftools::gputools::Platform* platform) { + se::Platform* platform) { ServiceOptions default_options; default_options.set_platform(platform); return NewService(default_options); @@ -131,7 +126,7 @@ int ServiceOptions::intra_op_parallelism_threads() const { /* static */ StatusOr> Service::NewService( const ServiceOptions& options) { - perftools::gputools::Platform* platform = options.platform(); + se::Platform* platform = options.platform(); std::unique_ptr execute_backend; if (platform == nullptr) { TF_ASSIGN_OR_RETURN(platform, PlatformUtil::GetDefaultPlatform()); @@ -235,8 +230,7 @@ tensorflow::Status Service::ValidateResultShapeWithLayout( StatusOr>> Service::ResolveAndValidateArguments( tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - stream_executors) { + tensorflow::gtl::ArraySlice stream_executors) { CHECK_EQ(options_.number_of_replicas(), stream_executors.size()); std::vector> replicated_arguments; replicated_arguments.resize(options_.number_of_replicas()); @@ -349,8 +343,7 @@ StatusOr> Service::CreateModuleConfig( StatusOr>> Service::BuildExecutables( std::vector versioned_handles, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); @@ -412,8 +405,7 @@ StatusOr>> Service::BuildExecutables( StatusOr>> Service::BuildExecutables( const std::vector& module_protos, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); @@ -493,7 +485,7 @@ StatusOr> Service::BuildExecutable( StatusOr> Service::BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + se::StreamExecutor* executor, ExecutionProfile* profile, DeviceMemoryAllocator* device_allocator) { std::shared_ptr executable = compilation_cache_.LookUp(versioned_handle, *module_config); @@ -541,7 +533,7 @@ Service::ExecuteParallelAndRegisterResult( // Streams where the computation are launched, so we can wait on the streams // to complete. std::vector::SmartPtr> streams; - std::vector> timers; + std::vector> timers; // Global data handles for the computation results, one for each computation. std::vector result_handles; @@ -558,15 +550,14 @@ Service::ExecuteParallelAndRegisterResult( // Stream executors for the replicas of the current computation. TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); CHECK_EQ(replicas.size(), arguments[i].size()); - std::vector> result_buffers; + std::vector result_buffers; for (int64 replica = 0; replica < replicas.size(); ++replica) { TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, backend->BorrowStream(replicas[replica])); streams.push_back(std::move(stream)); if (replica == 0 && profile != nullptr) { - timers.emplace_back( - new perftools::gputools::Timer(streams.back()->parent())); + timers.emplace_back(new se::Timer(streams.back()->parent())); streams.back() ->InitTimer(timers.back().get()) .ThenStartTimer(timers.back().get()); @@ -591,7 +582,7 @@ Service::ExecuteParallelAndRegisterResult( backend->StreamBorrower()); // Asynchronously launch the computation. - TF_ASSIGN_OR_RETURN(std::unique_ptr result, + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, executables[i]->ExecuteAsyncOnStream( &run_options, arguments[i][replica])); @@ -734,9 +725,9 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg, return computation->SetReturnValue(arg->operand()); } -StatusOr> -Service::GetExecutors(const ExecutionOptions& execution_options, - int64 requests_size, int64 request_index) const { +StatusOr> Service::GetExecutors( + const ExecutionOptions& execution_options, int64 requests_size, + int64 request_index) const { if (execution_options.device_handles().empty()) { return FailedPrecondition( "device handles must be given to execute parallel computations"); @@ -748,7 +739,7 @@ Service::GetExecutors(const ExecutionOptions& execution_options, "handles.", requests_size, request_index, execution_options.device_handles_size()); } - std::vector executors; + std::vector executors; for (const auto& device_handle : execution_options.device_handles()) { TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, device_handle)); @@ -780,7 +771,7 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg, VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString(); std::vector>> all_arguments; - std::vector> all_executors; + std::vector> all_executors; std::vector versioned_handles; std::vector> module_configs; std::vector computation_names; @@ -891,7 +882,7 @@ tensorflow::Status Service::ExecuteGraphParallel( VLOG(1) << "running execute-graph-parallel request"; std::vector>> all_arguments; - std::vector> all_executors; + std::vector> all_executors; std::vector module_protos; std::vector> module_configs; std::vector computation_names; @@ -1243,7 +1234,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, streams.push_back(std::move(stream)); } - std::vector> result_buffers; + std::vector result_buffers; for (size_t i = 0; i < streams.size(); ++i) { const auto& stream = streams[i]; ExecutableRunOptions options; @@ -1256,7 +1247,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, ServiceExecutableRunOptions service_options( options, execute_backend_->StreamBorrower()); - TF_ASSIGN_OR_RETURN(std::unique_ptr this_result_buffer, + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer, executable->ExecuteAsyncOnStream( &service_options, replicated_arguments[i])); @@ -1356,16 +1347,16 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg, } // Allocate memory in each replica and transfer the data to all replicas. - std::vector> replicated_buffers; + std::vector replicated_buffers; for (se::StreamExecutor* executor : replicas) { TF_ASSIGN_OR_RETURN( - std::unique_ptr shaped_buffer, - execute_backend_->transfer_manager()->AllocateShapedBuffer( + ScopedShapedBuffer shaped_buffer, + execute_backend_->transfer_manager()->AllocateScopedShapedBuffer( shape, execute_backend_->memory_allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR( execute_backend_->transfer_manager()->TransferLiteralToDevice( - executor, *literal, *shaped_buffer)); + executor, *literal, shaped_buffer)); replicated_buffers.emplace_back(std::move(shaped_buffer)); } TF_ASSIGN_OR_RETURN(*result->mutable_data(), @@ -1953,9 +1944,9 @@ DeviceHandle Service::SingleComputationDeviceHandle() const { return device_handle; } -StatusOr> Service::Replicas( +StatusOr> Service::Replicas( const Backend& backend, const DeviceHandle& device_handle) const { - std::vector replicas; + std::vector replicas; for (int replica = 0; replica < options_.number_of_replicas(); ++replica) { // From the computation placer, find out the device ids of the replicas for // the given device handle. diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index e399f1ac1904f8d6145f43b0ed12d8018765d9a1..476bd0597de735a9f777be78f5ab01dac1188525 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -53,8 +53,8 @@ namespace xla { class ServiceOptions { public: // Set the platform backing the service, or nullptr for the default platform. - ServiceOptions& set_platform(perftools::gputools::Platform* platform); - perftools::gputools::Platform* platform() const; + ServiceOptions& set_platform(se::Platform* platform); + se::Platform* platform() const; // Set the number of replicas to use when compiling replicated // programs. @@ -66,7 +66,7 @@ class ServiceOptions { int intra_op_parallelism_threads() const; private: - perftools::gputools::Platform* platform_ = nullptr; + se::Platform* platform_ = nullptr; int number_of_replicas_ = 1; int intra_op_parallelism_threads_ = -1; }; @@ -79,7 +79,7 @@ class Service : public ServiceInterface { public: // Factory method for creating a new Service. static StatusOr> NewService( - perftools::gputools::Platform* platform = nullptr); + se::Platform* platform = nullptr); static StatusOr> NewService( const ServiceOptions& options); @@ -286,7 +286,7 @@ class Service : public ServiceInterface { ExecuteResponse* result); // Prepare the executors for executing parallel. - StatusOr> GetExecutors( + StatusOr> GetExecutors( const ExecutionOptions& execution_options, int64 requests_size, int64 request_index) const; @@ -310,8 +310,7 @@ class Service : public ServiceInterface { StatusOr>> ResolveAndValidateArguments( tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice - stream_executors); + tensorflow::gtl::ArraySlice stream_executors); // Create a Hlo module config for the given program shape and arguments. // execution_options is optional; if not given a default is used. @@ -329,7 +328,7 @@ class Service : public ServiceInterface { StatusOr> BuildExecutable( const VersionedComputationHandle& versioned_handle, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator = nullptr); // Builds an Executable for the given HLO module proto. @@ -338,7 +337,7 @@ class Service : public ServiceInterface { StatusOr> BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator = nullptr); // Same as BuildExecutable() above, but builds a list of Executables for the @@ -346,14 +345,12 @@ class Service : public ServiceInterface { StatusOr>> BuildExecutables( std::vector versioned_handles, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator); StatusOr>> BuildExecutables( const std::vector& module_protos, std::vector> module_configs, - Backend* backend, - std::vector> executors, + Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator); // Similar to BuildExecutable, but look in the compilation cache for the @@ -362,7 +359,7 @@ class Service : public ServiceInterface { StatusOr> BuildAndCacheExecutable( const VersionedComputationHandle& versioned_handle, std::unique_ptr module_config, Backend* backend, - perftools::gputools::StreamExecutor* executor, ExecutionProfile* profile, + se::StreamExecutor* executor, ExecutionProfile* profile, DeviceMemoryAllocator* device_allocator = nullptr); // Runs the given executable with the given arguments and register the result @@ -411,7 +408,7 @@ class Service : public ServiceInterface { // Returns the stream executors assigned to the replicas represented by the // given device handle. Each device_handle is a virtual replicated device that // represents a set of physical devices for the replicas. - StatusOr> Replicas( + StatusOr> Replicas( const Backend& backend, const DeviceHandle& device_handle) const; Status MaybeDumpHloModule(const HloModule& module) const; diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h index 6c1f8feac7ed4423051cf2737be57dcfab508671..7f3910cdb0366078b97fb5f6a2dc498b37570926 100644 --- a/tensorflow/compiler/xla/service/service_executable_run_options.h +++ b/tensorflow/compiler/xla/service/service_executable_run_options.h @@ -28,7 +28,7 @@ namespace xla { class ServiceExecutableRunOptions { public: using StreamBorrower = - std::function::SmartPtr>(int)>; + std::function::SmartPtr>(int)>; ServiceExecutableRunOptions() : ServiceExecutableRunOptions(ExecutableRunOptions()) {} @@ -45,14 +45,13 @@ class ServiceExecutableRunOptions { ExecutableRunOptions* mutable_run_options() { return &run_options_; } // Delegate to `ExecutableRunOptions` member. - perftools::gputools::Stream* stream() const { return run_options_.stream(); } + se::Stream* stream() const { return run_options_.stream(); } DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); } int device_ordinal() const { return run_options_.device_ordinal(); } // Borrows a stream and returns a smart pointer which returns the stream on // destruction. - StatusOr::SmartPtr> BorrowStream( - int device_ordinal) const { + StatusOr::SmartPtr> BorrowStream(int device_ordinal) const { return borrow_stream_ ? borrow_stream_(device_ordinal) : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache"); diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 77e12d36024dae56003ad4e59b54f9934dfc2c58..48b2922e77b78719e5d3469cbaa4fc15969de91b 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -52,6 +52,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) { return UNOP_ABS; case HloOpcode::kCeil: return UNOP_CEIL; + case HloOpcode::kClz: + return UNOP_CLZ; case HloOpcode::kCos: return UNOP_COS; case HloOpcode::kExp: @@ -360,6 +362,7 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, arg, primitive_util::ComplexComponentType(arg.element_type())); } return arg; + case UNOP_CLZ: case UNOP_NEGATE: case UNOP_ROUND_NEAREST_AFZ: case UNOP_SIGN: diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 6e9986165f7eaf71a964b42b734a5ae5db5e45d7..fb3b5f06dad67b4305aed0305c9f6441e666db53 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -28,8 +28,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" -namespace se = ::perftools::gputools; - namespace xla { using ::tensorflow::strings::Appendf; @@ -68,6 +66,8 @@ ShapedBuffer& ShapedBuffer::operator=(ShapedBuffer&& s) { return *this; } +ShapedBuffer::~ShapedBuffer() {} + void ShapedBuffer::clear() { for (auto& pair : buffers_) { // A default constructed DeviceMemoryBase is a null pointer. @@ -104,18 +104,6 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) { return out; } -/* static */ -StatusOr> ScopedShapedBuffer::MakeScoped( - ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator) { - auto scoped_buffer = WrapUnique(new ScopedShapedBuffer( - shaped_buffer->on_host_shape(), shaped_buffer->on_device_shape(), - allocator, shaped_buffer->device_ordinal())); - scoped_buffer->buffers_ = shaped_buffer->buffers(); - shaped_buffer->clear(); - - return std::move(scoped_buffer); -} - ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, DeviceMemoryAllocator* allocator, @@ -128,7 +116,25 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer, DeviceMemoryAllocator* allocator) : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {} +ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s) + : ShapedBuffer(static_cast(s)), allocator_(s.allocator_) { + // Null out s.allocator_ so it doesn't try to free anything in its destructor. + s.allocator_ = nullptr; +} + +ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) { + *static_cast(this) = std::move(static_cast(s)); + allocator_ = s.allocator_; + // Null out s.allocator_ so it doesn't try to free anything in its destructor. + s.allocator_ = nullptr; + return *this; +} + ScopedShapedBuffer::~ScopedShapedBuffer() { + // allocator_ will be null if we were moved-from. + if (allocator_ == nullptr) { + return; + } // Deallocate all non-null buffers. A buffer may appear in more than one spot // in the shape (eg, a tuple with a repeated element) so keep track of what // has been deallocated. @@ -144,9 +150,9 @@ ScopedShapedBuffer::~ScopedShapedBuffer() { } } -std::unique_ptr ScopedShapedBuffer::release() { - auto shaped_buffer = MakeUnique(std::move(*this)); - buffers_ = ShapeTree(); +ShapedBuffer ScopedShapedBuffer::release() { + ShapedBuffer shaped_buffer(static_cast(*this)); + buffers_ = ShapeTree(); return shaped_buffer; } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index b816df8385ef65b0b69ede1d6e65a1991b4bd7c6..e10fca9e9466c018f6cb4da2f5618e4db4977307 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -30,6 +30,8 @@ limitations under the License. namespace xla { +class ScopedShapedBuffer; + // Class which encapsulates a buffer or set of buffers containing data of a // particular XLA shape. class ShapedBuffer { @@ -41,8 +43,19 @@ class ShapedBuffer { // determines the number of device allocations (DeviceMemoryBase) held by the // ShapedBuffer. ShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - const perftools::gputools::Platform* platform, - int device_ordinal); + const se::Platform* platform, int device_ordinal); + + // Movable, but not copyable. + ShapedBuffer(ShapedBuffer&& s); + ShapedBuffer& operator=(ShapedBuffer&&); + ShapedBuffer(const ShapedBuffer&) = delete; + ShapedBuffer& operator=(const ShapedBuffer&) = delete; + + // Prevent (some forms of) accidental object slicing. + ShapedBuffer(const ScopedShapedBuffer&) = delete; + ShapedBuffer& operator=(const ScopedShapedBuffer&) = delete; + + virtual ~ShapedBuffer(); // Returns the shape of the on-host representation of the data held by this // ShapedBuffer. @@ -52,48 +65,36 @@ class ShapedBuffer { // ShapedBuffer. const Shape& on_device_shape() const { return on_device_shape_; } - const perftools::gputools::Platform* platform() const { return platform_; } + const se::Platform* platform() const { return platform_; } int device_ordinal() const { return device_ordinal_; } // Return the root buffer of the shape (shape index {}). - const perftools::gputools::DeviceMemoryBase& root_buffer() const { + const se::DeviceMemoryBase& root_buffer() const { return buffer(/*index=*/{}); } // Returns the buffer at the given shape index where index is defined as in // ShapeUtil::GetSubshape. - const perftools::gputools::DeviceMemoryBase& buffer( - const ShapeIndex& index) const { + const se::DeviceMemoryBase& buffer(const ShapeIndex& index) const { return buffers_.element(index); } // Sets the device memory buffer at the given index. - void set_buffer(const perftools::gputools::DeviceMemoryBase& buffer, - const ShapeIndex& index) { + void set_buffer(const se::DeviceMemoryBase& buffer, const ShapeIndex& index) { *buffers_.mutable_element(index) = buffer; } // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. - const ShapeTree& buffers() const { - return buffers_; - } - ShapeTree& buffers() { - return buffers_; - } + const ShapeTree& buffers() const { return buffers_; } + ShapeTree& buffers() { return buffers_; } // Set all device memory pointers in the object to null. void clear(); string ToString() const; - ShapedBuffer(ShapedBuffer&& s); - ShapedBuffer& operator=(ShapedBuffer&&); - protected: - ShapedBuffer(const ShapedBuffer&) = delete; - ShapedBuffer& operator=(const ShapedBuffer&) = delete; - // The shape of the data when represented on the host. Shape on_host_shape_; @@ -101,13 +102,13 @@ class ShapedBuffer { Shape on_device_shape_; // The platform the memory is allocated on. - const perftools::gputools::Platform* platform_; + const se::Platform* platform_; // The device the memory is allocated on. int device_ordinal_; // The tree of device buffers. Its shape is on_device_shape(). - ShapeTree buffers_; + ShapeTree buffers_; }; std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); @@ -115,41 +116,45 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); // ShapedBuffer derived class which allocates all internal buffers on // construction and deallocates the memory when the object is // destructed. +// +// TODO(timshen): Remove inheritance between ScopedShapedBuffer and +// ShapedBuffer. There should never be a need to consider a ScopedShapedBuffer +// as a ShapedBuffer, because in that case we should just be able to pass around +// our ShapeTree. Inheritance only adds complexity. See +// discussion in cl/192849370. class ScopedShapedBuffer : public ShapedBuffer { public: - // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the - // deallocation of the device memory held in the shaped buffer. All device - // memory pointers in the given ShapedBuffer are set to null. - static StatusOr> MakeScoped( - ShapedBuffer* shaped_buffer, DeviceMemoryAllocator* allocator); - - // Create a ScopedShapedBuffer with null DeviceMemoryBases at each index. - ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - DeviceMemoryAllocator* allocator, int device_ordinal); + // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index. + explicit ScopedShapedBuffer(const Shape& on_host_shape, + const Shape& on_device_shape, + DeviceMemoryAllocator* allocator, + int device_ordinal); // Create a ScopedShapedBuffer by taking over the memory from the incoming // ShapedBuffer. - ScopedShapedBuffer(ShapedBuffer shaped_buffer, - DeviceMemoryAllocator* allocator); + explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer, + DeviceMemoryAllocator* allocator); + + // Movable, but not copyable. + ScopedShapedBuffer(ScopedShapedBuffer&& s); + ScopedShapedBuffer& operator=(ScopedShapedBuffer&&); + ScopedShapedBuffer(const ScopedShapedBuffer&) = delete; + ScopedShapedBuffer& operator=(const ScopedShapedBuffer&) = delete; + + // All buffers in the shape are deallocated on destruction. + ~ScopedShapedBuffer() override; // Return the allocator used to allocate the device memory held in this // ScopedShapedBuffer. DeviceMemoryAllocator* memory_allocator() const { return allocator_; } - // Release all device memory owned by this ScopedShapedBuffer and - // return the device memory pointers in the form of a - // ShapedBuffer. The returned ShapedBuffer takes over the memory - // from the ScopedShapedBuffer. The resulting ScopedShapedBuffer can - // only be destroyed. - std::unique_ptr release(); - - // All buffers in the shape are deallocated on destruction. - virtual ~ScopedShapedBuffer(); + // Releases all device memory owned by this ScopedShapedBuffer and returns the + // device memory pointers in the form of a ShapedBuffer. The returned + // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The + // resulting ScopedShapedBuffer can only be destroyed. + ShapedBuffer release(); protected: - ScopedShapedBuffer(const ScopedShapedBuffer&) = delete; - void operator=(const ScopedShapedBuffer&) = delete; - DeviceMemoryAllocator* allocator_; }; diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 2f36e2b16e0f2eed10aef811dd3cceeba6a5b8a9..8b71a415091f028b3167cddb2583754e72ba17c8 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -25,24 +25,20 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" -namespace se = ::perftools::gputools; - namespace xla { /* static */ tensorflow::mutex TransferManager::platform_transfer_manager_mutex_( tensorflow::LINKER_INITIALIZED); -/* static */ std::map* +/* static */ std::map* TransferManager::GetPlatformTransferManagers() { - static auto* r = - new std::map; + static auto* r = new std::map; return r; } Status TransferManager::TransferArrayToDevice( - perftools::gputools::StreamExecutor* executor, const Literal& literal, - const perftools::gputools::DeviceMemoryBase& dest) { + se::StreamExecutor* executor, const Literal& literal, + const se::DeviceMemoryBase& dest) { const Shape on_device_shape = HostShapeToDeviceShape(literal.shape()); TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape)) << "On-device representation of " @@ -61,8 +57,8 @@ Status TransferManager::TransferArrayToDevice( } StatusOr> TransferManager::TransferArrayFromDevice( - perftools::gputools::StreamExecutor* executor, const Shape& shape, - const perftools::gputools::DeviceMemoryBase& source) { + se::StreamExecutor* executor, const Shape& shape, + const se::DeviceMemoryBase& source) { TF_RET_CHECK(ShapeUtil::Equal(HostShapeToDeviceShape(shape), shape)) << "Shape " << ShapeUtil::HumanString(shape) << " has a differently shaped representation on-device: " @@ -112,8 +108,7 @@ StatusOr> TransferManager::TransferArrayFromDevice( } Status TransferManager::WriteTupleIndexTables( - perftools::gputools::StreamExecutor* executor, - const ShapedBuffer& device_buffer) { + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) { VLOG(2) << "Writing tuple index tables for " << device_buffer; TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal()); @@ -180,7 +175,7 @@ Status TransferManager::TransferBufferToDevice( return Status::OK(); } -StatusOr> TransferManager::AllocateShapedBuffer( +StatusOr TransferManager::AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { @@ -192,31 +187,21 @@ StatusOr> TransferManager::AllocateShapedBuffer( const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape); TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape)); - auto shaped_buffer = WrapUnique(new ShapedBuffer( - on_host_shape, on_device_shape, allocator->platform(), device_ordinal)); + ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape, allocator, + device_ordinal); // Allocate an appropriate sized buffer for each element in the shape // including the tuple pointer arrays. - for (auto& pair : shaped_buffer->buffers()) { + for (auto& pair : shaped_buffer.buffers()) { const ShapeIndex& index = pair.first; se::DeviceMemoryBase& memory_base = pair.second; const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index); TF_ASSIGN_OR_RETURN(memory_base, - allocator->Allocate(shaped_buffer->device_ordinal(), + allocator->Allocate(shaped_buffer.device_ordinal(), GetByteSizeRequirement(subshape))); } return std::move(shaped_buffer); } -StatusOr> -TransferManager::AllocateScopedShapedBuffer(const Shape& on_host_shape, - DeviceMemoryAllocator* allocator, - int device_ordinal) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr unscoped_buffer, - AllocateShapedBuffer(on_host_shape, allocator, device_ordinal)); - return ScopedShapedBuffer::MakeScoped(unscoped_buffer.get(), allocator); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 9f2b5c4aecf0b52f610171e0c2755de577b2bd9e..d82b4f0f81b5da38c1caf80bddefa0d3f7842463 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -42,7 +42,7 @@ class TransferManager { virtual ~TransferManager() {} // Returns the ID of the platform that this transfer manager acts on. - virtual perftools::gputools::Platform::Id PlatformId() const = 0; + virtual se::Platform::Id PlatformId() const = 0; // Returns the shape of the on-device representation for the given shape on // the host. This is intended for use with ShapedBuffer where buffers are @@ -58,48 +58,45 @@ class TransferManager { // DeviceShape(literal_shape) must be compatible, but need not have the same // layout. virtual StatusOr> TransferLiteralFromDevice( - perftools::gputools::StreamExecutor* executor, - const ShapedBuffer& device_buffer) = 0; + se::StreamExecutor* executor, const ShapedBuffer& device_buffer) = 0; // Transfers the given literal into the previously allocated device memory // represented by the given ShapedBuffer using the given executor. The shape // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible, // but need not have the same layout - virtual Status TransferLiteralToDevice( - perftools::gputools::StreamExecutor* executor, const Literal& literal, - const ShapedBuffer& device_buffer) = 0; + virtual Status TransferLiteralToDevice(se::StreamExecutor* executor, + const Literal& literal, + const ShapedBuffer& device_buffer) = 0; // Convenience methods for transferring an array to or from the device at a // known address. This avoids having to construct a ShapedBuffer just to // transfer an array at a known address. - Status TransferArrayToDevice( - perftools::gputools::StreamExecutor* executor, const Literal& literal, - const perftools::gputools::DeviceMemoryBase& dest); + Status TransferArrayToDevice(se::StreamExecutor* executor, + const Literal& literal, + const se::DeviceMemoryBase& dest); StatusOr> TransferArrayFromDevice( - perftools::gputools::StreamExecutor* executor, const Shape& shape, - const perftools::gputools::DeviceMemoryBase& source); + se::StreamExecutor* executor, const Shape& shape, + const se::DeviceMemoryBase& source); // Transfers the given literal into the Infeed interface of the device, // using the given executor. - virtual Status TransferLiteralToInfeed( - perftools::gputools::StreamExecutor* executor, - const Literal& literal) = 0; + virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor, + const Literal& literal) = 0; // Transfers the given literal from the Outfeed interface of the device, // using the given executor. - virtual Status TransferLiteralFromOutfeed( - perftools::gputools::StreamExecutor* executor, const Shape& literal_shape, - Literal* literal) = 0; + virtual Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, + const Shape& literal_shape, + Literal* literal) = 0; // Resets the devices associated with this transfer manager. virtual Status ResetDevices( - tensorflow::gtl::ArraySlice - executor) = 0; + tensorflow::gtl::ArraySlice executor) = 0; // Given an allocated ShapedBuffer, constructs the tuple index table(s) in // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the // ShapedBuffer is array-shaped this method does nothing. - Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor, + Status WriteTupleIndexTables(se::StreamExecutor* executor, const ShapedBuffer& device_buffer); // Determines the byte size requirement for the given shape on the underlying @@ -107,13 +104,10 @@ class TransferManager { // region for a host-to-device transfer. virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0; - // Allocate a ShapedBuffer which can hold data with the given on-host + // Allocates a ScopedShapedBuffer which can hold data with the given on-host // shape. The on-device shape may be different as indicated by // HostShapeToDeviceShape. - StatusOr> AllocateShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, - int device_ordinal); - StatusOr> AllocateScopedShapedBuffer( + StatusOr AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal); @@ -127,13 +121,13 @@ class TransferManager { // Precondition: a platform kind must not be registered more than once. typedef std::unique_ptr (*TransferManagerCreationFunction)(); static void RegisterTransferManager( - perftools::gputools::Platform::Id platform_id, + se::Platform::Id platform_id, TransferManagerCreationFunction transfer_manager); // Returns the transfer manager singleton pointer if it is available for the // given platform, or an error status if it is not. static StatusOr GetForPlatform( - const perftools::gputools::Platform* platform); + const se::Platform* platform); protected: // Transfer a memory block of the given size from 'source' buffer to the @@ -143,35 +137,32 @@ class TransferManager { // // source is the source data that must be in the target-dependent layout that // the Infeed HLO used in the computation expects. - virtual Status TransferBufferToInfeed( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source) = 0; + virtual Status TransferBufferToInfeed(se::StreamExecutor* executor, + int64 size, const void* source) = 0; // Transfer a memory block of the given size from the device source into the // 'destination' buffer. // // size is the size to transfer to destination in bytes. - virtual Status TransferBufferFromDevice( - perftools::gputools::StreamExecutor* executor, - const perftools::gputools::DeviceMemoryBase& source, int64 size, - void* destination); + virtual Status TransferBufferFromDevice(se::StreamExecutor* executor, + const se::DeviceMemoryBase& source, + int64 size, void* destination); // Transfer a memory block of the given size from 'source' buffer to the given // destination of the device. // // size is the size to transfer from source in bytes. - virtual Status TransferBufferToDevice( - perftools::gputools::StreamExecutor* executor, int64 size, - const void* source, perftools::gputools::DeviceMemoryBase* destination); + virtual Status TransferBufferToDevice(se::StreamExecutor* executor, + int64 size, const void* source, + se::DeviceMemoryBase* destination); // Writes the given device-memory pointers in 'elements' to the given region // to construct a tuple index table in the platform-specific tuple // representation. virtual Status WriteSingleTupleIndexTable( - perftools::gputools::StreamExecutor* executor, - tensorflow::gtl::ArraySlice - elements, - const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0; + se::StreamExecutor* executor, + tensorflow::gtl::ArraySlice elements, + const Shape& shape, se::DeviceMemoryBase* region) = 0; private: // The mutex that guards the platform-to-transfer manager map. @@ -186,8 +177,7 @@ class TransferManager { }; // Map from platform kind to transfer manager singleton. - static std::map* - GetPlatformTransferManagers(); + static std::map* GetPlatformTransferManagers(); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc index 83185ac49e9b7c386d10d1cbc4e20dcdfdfd6cae..3efd38ce0daa3e3f3398b32463019df6cd10a009 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.cc +++ b/tensorflow/compiler/xla/service/transpose_folding.cc @@ -159,6 +159,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) { auto new_conv = HloInstruction::CreateConvolve( convolution.shape(), new_lhs, new_rhs, convolution.window(), new_dnums); + convolution.SetupDerivedInstruction(new_conv.get()); TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction( &convolution, std::move(new_conv))); diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index 532f7fd5bfc1dffa86638a6bc51832beebd74e1d..0f16a592b68e20f5dbd1e4655ad5720ecce5a7bd 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -49,6 +49,8 @@ HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) { return HloOpcode::kAbs; case UNOP_CEIL: return HloOpcode::kCeil; + case UNOP_CLZ: + return HloOpcode::kClz; case UNOP_COS: return HloOpcode::kCos; case UNOP_EXP: diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h index 4c83750f3e6f3c735db66d8e0b86ae3f43e5ca11..a1dce758cd3ab3f204ce330fca2a7d2bdf57a2be 100644 --- a/tensorflow/compiler/xla/shape_layout.h +++ b/tensorflow/compiler/xla/shape_layout.h @@ -48,8 +48,7 @@ class ShapeLayout { bool MatchesLayoutInShape(const Shape& shape) const; // Copies the layout from the given shape into this ShapeLayout. 'other_shape' - // must be compatible with the ShapeLayout's shape, and 'other_shape' must - // have a layout (LayoutUtil::HasLayout). + // must be compatible with the ShapeLayout's shape. tensorflow::Status CopyLayoutFromShape(const Shape& other_shape); // Clears (Layout::Clear) all the Layouts stored in this object. diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 63da9154cfc1a5e7e8c0eeaa103d27096540fefe..5fa728e7c2fa5faf6ba347198fdc99e56ca4c324 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index 641b5e9a6accc0a2e7737f79bcd485d317e4e521..cccbce5fc83af87396f4d51eb9e785cea93aba0b 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -113,17 +113,19 @@ class StatusOr : private internal_statusor::StatusOrData, StatusOr& operator=(StatusOr&&) = default; // Conversion copy/move constructor, T must be convertible from U. - // TODO(b/62186717): These should not participate in overload resolution if U - // is not convertible to T. - template + template ::value>::type* = nullptr> StatusOr(const StatusOr& other); - template + template ::value>::type* = nullptr> StatusOr(StatusOr&& other); // Conversion copy/move assignment operator, T must be convertible from U. - template + template ::value>::type* = nullptr> StatusOr& operator=(const StatusOr& other); - template + template ::value>::type* = nullptr> StatusOr& operator=(StatusOr&& other); // Constructs a new StatusOr with the given value. After calling this @@ -233,12 +235,14 @@ StatusOr& StatusOr::operator=(Status&& status) { } template -template +template ::value>::type*> inline StatusOr::StatusOr(const StatusOr& other) : Base(static_cast::Base&>(other)) {} template -template +template ::value>::type*> inline StatusOr& StatusOr::operator=(const StatusOr& other) { if (other.ok()) this->Assign(other.ValueOrDie()); @@ -248,12 +252,14 @@ inline StatusOr& StatusOr::operator=(const StatusOr& other) { } template -template +template ::value>::type*> inline StatusOr::StatusOr(StatusOr&& other) : Base(static_cast::Base&&>(other)) {} template -template +template ::value>::type*> inline StatusOr& StatusOr::operator=(StatusOr&& other) { if (other.ok()) { this->Assign(std::move(other).ValueOrDie()); diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 1f90a44d8ba725c1bc7d23b581161f8915ff74fd..c28d14ba8ac3a0b893485dae65a160f903deb3f1 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -153,6 +153,8 @@ tf_cc_binary( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", @@ -191,6 +193,7 @@ cc_library( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -257,8 +260,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", @@ -288,6 +291,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -311,6 +316,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -330,6 +337,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -371,6 +380,8 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:test_utils", @@ -390,6 +401,7 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -442,6 +454,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -461,6 +475,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -478,6 +494,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -514,6 +532,8 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -535,6 +555,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -554,6 +576,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -578,6 +602,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -604,6 +630,7 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -670,6 +697,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -702,9 +731,6 @@ xla_test( "cpu": [ "--xla_cpu_multi_thread_eigen=false", ], - "cpu_parallel": [ - "--xla_cpu_multi_thread_eigen=false", - ], }, shard_count = 20, tags = ["optonly"], @@ -715,6 +741,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -738,6 +766,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -760,6 +790,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -801,7 +833,6 @@ xla_test( backend_tags = { # TODO(b/31436974): Fix msan failure. Failed on 2016-09-12. "cpu": ["nomsan"], - "cpu_parallel": ["nomsan"], }, shard_count = 30, deps = [ @@ -813,6 +844,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -836,6 +869,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -898,6 +933,8 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -923,6 +960,8 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -963,6 +1002,8 @@ xla_test( "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1038,6 +1079,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1196,6 +1239,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1235,6 +1280,8 @@ xla_test( "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1256,6 +1303,8 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1294,6 +1343,8 @@ xla_test( deps = [ "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1310,6 +1361,8 @@ xla_test( deps = [ "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1335,6 +1388,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -1355,6 +1410,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -1428,6 +1485,8 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1472,6 +1531,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1514,6 +1575,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -1532,6 +1595,8 @@ xla_test( deps = [ "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1595,6 +1660,8 @@ xla_test( ":client_library_test_base", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -1608,6 +1675,8 @@ xla_test( ":client_library_test_base", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -1625,11 +1694,11 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1713,6 +1782,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:platform_util", @@ -1740,6 +1811,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:platform_util", @@ -1777,6 +1850,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1802,6 +1877,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", @@ -1860,6 +1937,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1886,6 +1965,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1982,6 +2063,8 @@ xla_test( ":test_utils", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 03c91745b978f80801e0da5ac44d31959659b20c..e8a5efe796a9209307ecfa343b89f66ff2a34e0f 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" @@ -214,7 +213,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) { } XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector lhs{0xFFFFFFFF, static_cast(-1), @@ -255,7 +254,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) { } XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector lhs{static_cast(0x8000000000000000LL), static_cast(0x8000000000000000LL), @@ -1332,7 +1331,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) { // Some Pow cases that can be implemented more efficiently. XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values = {1.0f, 2.0f, 3.2f, -4.0f}; std::vector exponents = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1360,7 +1359,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) { } XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1385,7 +1384,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) { } XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1410,7 +1409,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) { } XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1435,7 +1434,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) { } XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1460,7 +1459,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) { } XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1492,7 +1491,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) { } XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1525,7 +1524,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) { } XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f}; @@ -1558,7 +1557,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) { } XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -2217,6 +2216,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogF32sVector) { error_spec_); } +XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) { + XlaBuilder builder(TestName()); + auto a = builder.ConstantR1( + {0, 1, 0x10, 0x10000, 0x700000, 0x12345678, 0xF2345678}); + builder.Clz(a); + + ComputeAndCompareR1(&builder, {32, 31, 27, 15, 9, 3, 0}, {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) { // a ------ (add) --------- (add) // / / @@ -2348,7 +2356,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) { // Test broadcasting in Eq comparison. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v = builder.ConstantR1({42, 73}); auto m = builder.ConstantR2({{42, 73}, {42, 52}}); @@ -2774,7 +2782,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) { // Regression test for b/31927799. "slice - y" is fused and requires implicit // broadcast. XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x_literal = Literal::CreateR1({1, 2, 3}); auto y_literal = Literal::CreateR1({4, 5}); auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc index ec3b46acfec0ee0ff514a862ce5b1ca74279efa8..fcd9ff55e393f64476ddd4754e0fa74427f1cb51 100644 --- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc +++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc @@ -15,7 +15,6 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -42,7 +41,7 @@ TEST_F(AxpySimpleTest, AxTenValues) { } XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) { - ComputationBuilder builder(client_, "axpy_10"); + XlaBuilder builder("axpy_10"); auto alpha = builder.ConstantR0(3.1415926535); auto x = builder.ConstantR1({}); auto y = builder.ConstantR1({}); @@ -54,7 +53,7 @@ XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) { } TEST_F(AxpySimpleTest, AxpyTenValues) { - ComputationBuilder builder(client_, "axpy_10"); + XlaBuilder builder("axpy_10"); auto alpha = builder.ConstantR0(3.1415926535); auto x = builder.ConstantR1( {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc index e4bf1827acf24bcdbfe20fe39e794a0265ab89e3..22c3394e6f34bd018ffaaaa4d9d68339673c3764 100644 --- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc +++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc @@ -18,9 +18,9 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -34,13 +34,13 @@ namespace { class BadRngShapeValidationTest : public ClientLibraryTestBase {}; TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0.0); auto one = builder.ConstantR0(1.0); Shape default_constructed; builder.RngUniform(zero, one, default_constructed); - StatusOr computation = builder.Build(); + StatusOr computation = builder.Build(); EXPECT_FALSE(computation.ok()); LOG(INFO) << "status received: " << computation.status(); EXPECT_THAT(computation.status().error_message(), @@ -48,7 +48,7 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { } TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0.0); auto one = builder.ConstantR0(1.0); Shape sans_layout; @@ -57,7 +57,7 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { builder.RngUniform(zero, one, sans_layout); - StatusOr computation = builder.Build(); + StatusOr computation = builder.Build(); ASSERT_TRUE(computation.ok()); LOG(INFO) << computation.status(); } diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc index b853dfaa15d7ff2e21048a5a6a486d22c5a05416..4e65cf11f3f1a027e1adc5a89930caba28958fea 100644 --- a/tensorflow/compiler/xla/tests/bfloat16_test.cc +++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc @@ -19,10 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -52,7 +51,7 @@ class Bfloat16Test : public ClientLibraryTestBase { }; XLA_TEST_F(Bfloat16Test, ScalarOperation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR0(static_cast(2.0f)); auto y = builder.ConstantR0(static_cast(1.0f)); builder.Add(x, y); @@ -62,7 +61,7 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) { } XLA_TEST_F(Bfloat16Test, LogOperation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR0(static_cast(4.0f)); builder.Log(x); @@ -71,7 +70,7 @@ XLA_TEST_F(Bfloat16Test, LogOperation) { } XLA_TEST_F(Bfloat16Test, NegateScalarF16) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Neg(builder.ConstantR0(static_cast(2.1f))); ComputeAndCompareR0(&builder, static_cast(-2.1f), {}, @@ -80,7 +79,7 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) { XLA_TEST_F(Bfloat16Test, BatchNormTraining) { const int kFeatureIndex = 2; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto operand = builder.ConstantR4FromArray4D( {{{{static_cast(1.f)}, {static_cast(2.f)}}, @@ -117,7 +116,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) { XLA_TEST_F(Bfloat16Test, BatchNormGrad) { const int kFeatureIndex = 2; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto operand = builder.ConstantR4FromArray4D( Array4D(2, 2, 2, 1, static_cast(0.0f))); diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc index 97fec89b63fb8d3a4264275f3253a91e1ea2ce68..48203b1d40ea69ff00a57c2c9e42620739b23d59 100644 --- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc +++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -32,7 +32,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_32x4) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 32, 4); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 4); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -48,7 +48,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_129x129) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 129); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 129); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -64,7 +64,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_9x5) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 9, 5); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 9, 1); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -80,7 +80,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 257); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 129, 1); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -93,7 +93,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) { } TEST_F(BinopScalingTest, R0PlusR2F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR0(42.0); auto rhs = builder.ConstantR2({ {1.0, 2.0}, {3.0, 4.0}, @@ -109,7 +109,7 @@ TEST_F(BinopScalingTest, R0PlusR2F32) { } TEST_F(BinopScalingTest, R4PlusR0S32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // clang-format off Array4D lhs_array({ {{{1, 2}, diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc index 777ac167a3c38c38791e12541a5db3078c37595b..bff60f25ec8f15d372d251ac313200301a04f20f 100644 --- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc +++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc @@ -34,7 +34,7 @@ namespace { class BitcastConvertTest : public ClientLibraryTestBase { public: - explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr) + explicit BitcastConvertTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc index 97095f1cc427789845051a8fea24c95475286fe2..34c86e007beea1cbac04641bdbdab62dc567f13e 100644 --- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc @@ -19,8 +19,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -33,10 +33,8 @@ namespace { class BroadcastSimpleTest : public ClientLibraryTestBase { public: - ComputationDataHandle BuildBinOp(HloOpcode op, - const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, - ComputationBuilder* builder) { + XlaOp BuildBinOp(HloOpcode op, const XlaOp& lhs, const XlaOp& rhs, + XlaBuilder* builder) { switch (op) { case HloOpcode::kMinimum: { return builder->Min(lhs, rhs); @@ -105,21 +103,21 @@ class BroadcastSimpleTest : public ClientLibraryTestBase { using ::testing::HasSubstr; XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(1.5), {}); ComputeAndCompareR0(&b, 1.5, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(2.25), {2, 3}); Array2D expected(2, 3, 2.25); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { - ComputationBuilder b(client_, TestName()); - ComputationDataHandle src; + XlaBuilder b(TestName()); + XlaOp src; std::unique_ptr param_data = CreateR0Parameter(2.25f, /*parameter_number=*/0, /*name=*/"src", /*builder=*/&b, /*data_handle=*/&src); @@ -131,21 +129,21 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(2.25), {2, 0}); Array2D expected(2, 0); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_0x2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(2.25), {0, 2}); Array2D expected(0, 2); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR1({1, 2, 3}), {2}); Array2D expected(2, 3); @@ -160,7 +158,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { // Tests implicit broadcasting of PREDs. XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); Array2D x_vals(2, 1); x_vals(0, 0) = true; @@ -171,7 +169,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { y_vals(1, 0, 0) = true; y_vals(1, 1, 0) = true; - ComputationDataHandle x, y; + XlaOp x, y; auto x_data = CreateR2Parameter(x_vals, 0, "x", &b, &x); auto y_data = CreateR3Parameter(y_vals, 1, "y", &b, &y); b.And(x, y, /*broadcast_dimensions=*/{1, 2}); @@ -186,7 +184,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { } XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR1({}), {2}); Array2D expected(2, 0); @@ -194,7 +192,7 @@ XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { } XLA_TEST_F(BroadcastSimpleTest, 1DToZeroElement2D) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR1({1, 2, 3}), {0}); Array2D expected(0, 3); @@ -209,7 +207,7 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) { // broadcasting (broadcast_dimensions {1, 2}), then is added to the rhs shape // [2, 3, 1]. Degenerate dimension broadcasting then broadcasts the size one // dimensions. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 5.0}}), b.ConstantLiteral(*Literal::CreateR3( @@ -247,7 +245,7 @@ class BroadcastR3ImplicitTest XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { const R3ImplicitBroadcastSpec& spec = GetParam(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Shape r3_shape, r3_implicit_shape; Array3D r3_array(spec.output_bounds[0], spec.output_bounds[1], @@ -264,8 +262,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input"); auto r3_parameter = builder.Parameter(1, r3_shape, "input"); - ComputationDataHandle op = - BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); + XlaOp op = BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); Array3D expected_array(spec.output_bounds[0], spec.output_bounds[1], spec.output_bounds[2]); @@ -300,9 +297,9 @@ INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances, // r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1: XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) { - ComputationBuilder b(client_, TestName()); - ComputationDataHandle r1h; - ComputationDataHandle r3h; + XlaBuilder b(TestName()); + XlaOp r1h; + XlaOp r3h; Array3D r1d = {{{1}}, {{2}}}; Array3D r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}; @@ -319,7 +316,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -332,7 +329,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}, {2}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -345,7 +342,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}, {3, 4}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -358,7 +355,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}}, {{3, 4}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -371,7 +368,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}, {2}}, {{3}, {4}}})); auto r3 = b.ConstantLiteral( @@ -385,7 +382,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -491,7 +488,7 @@ class BroadcastR2ImplicitTest XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { const R2ImplicitBroadcastSpec& spec = GetParam(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Operands with degenerate dimensions require implicit broadcasting: Shape r2_shape, r2_implicit_shape1, r2_implicit_shape2; @@ -517,10 +514,9 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { auto r2_implicit_parameter2 = builder.Parameter(2, r2_implicit_shape2, "input2"); - ComputationDataHandle op1 = + XlaOp op1 = BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder); - ComputationDataHandle op2 = - BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); + XlaOp op2 = BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); Array2D expected_array(spec.output_bounds[0], spec.output_bounds[1]); @@ -547,7 +543,7 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances, ::testing::ValuesIn(kR2ImplicitBroadcastTestCases)); XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}})); auto r2 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}, {3, 4}})); b.Add(r2, r1); @@ -558,7 +554,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) { } XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR2({{1}, {2}})); auto r2 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}, {3, 4}})); b.Add(r2, r1); @@ -569,7 +565,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantR1({10, 20}); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -582,7 +578,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantR1({10, 20}); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -595,7 +591,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantR1({10, 20}); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -608,7 +604,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1_0 = b.ConstantR1({1000, 2000}); auto r1_1 = b.ConstantR1({100, 200}); auto r1_2 = b.ConstantR1({10, 20}); @@ -629,7 +625,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1_0 = b.ConstantR1({1000, 2000}); auto r1_1 = b.ConstantR1({100, 200}); auto r1_2 = b.ConstantR1({10, 20}); @@ -652,7 +648,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) { XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { // Binary dimension broadcasting of the smaller lhs ([2, 2] up to [2, 2, 2]) // results in a shape incompatible with the lhs [2, 3, 1]. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 5.0}, {1.0, 5.0}}), b.ConstantLiteral(*Literal::CreateR3( @@ -667,7 +663,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { // Test invalid broadcasting with [1, 2] and [2, 3] inputs. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 2.0}}), b.ConstantR2({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); @@ -680,7 +676,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) { // Test invalid broadcasting with [1, 2] and [2, 3] inputs. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 2.0}}), b.ConstantR2({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index eac2eb286c3f7a1cd33aed03686e99ef753b773a..53f2c3bfbfce9585cb68f103a495ce2f1ad8432e 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured") load("//tensorflow/compiler/xla/tests:plugin.bzl", "plugins") load("//tensorflow:tensorflow.bzl", "tf_cc_test") -all_backends = ["cpu", "cpu_parallel", "gpu"] + plugins.keys() +all_backends = ["cpu", "gpu"] + plugins.keys() def filter_backends(backends): """Removes "gpu" from a backend list if CUDA is not enabled. @@ -39,10 +39,10 @@ def xla_test(name, **kwargs): """Generates cc_test targets for the given XLA backends. - This rule generates a cc_test target for one or more XLA backends and also - a platform-agnostic cc_library rule. The arguments are identical to cc_test - with two additions: 'backends' and 'backend_args'. 'backends' specifies the - backends to generate tests for ("cpu", "cpu_parallel", "gpu"), and + This rule generates a cc_test target for one or more XLA backends and also a + platform-agnostic cc_library rule. The arguments are identical to cc_test with + two additions: 'backends' and 'backend_args'. 'backends' specifies the + backends to generate tests for ("cpu", "gpu"), and 'backend_args'/'backend_tags' specifies backend-specific args parameters to use when generating the cc_test. @@ -90,9 +90,9 @@ def xla_test(name, deps: Dependencies of the target. xla_test_library_deps: If set, the generated test targets will depend on the respective cc_libraries generated by the xla_test_library rule. - backends: A list of backends to generate tests for. Supported - values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will - be generated for all supported backends. + backends: A list of backends to generate tests for. Supported values: "cpu", + "gpu". If this list is empty, the test will be generated for all supported + backends. blacklisted_backends: A list of backends to NOT generate tests for. args: Test arguments for the target. tags: Tags for the target. @@ -128,10 +128,6 @@ def xla_test(name, if backend == "cpu": backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"] backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"] - elif backend == "cpu_parallel": - backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"] - backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"] - this_backend_args += ["--xla_backend_extra_options=\"xla_cpu_parallel\""] elif backend == "gpu": backend_deps = ["//tensorflow/compiler/xla/service:gpu_plugin"] backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"] @@ -201,7 +197,7 @@ def xla_test_library(name, hdrs: Headers for the target. deps: Dependencies of the target. backends: A list of backends to generate libraries for. - Supported values: "cpu", "cpu_parallel", "gpu". If this list is empty, the + Supported values: "cpu", "gpu". If this list is empty, the library will be generated for all supported backends. """ @@ -210,7 +206,7 @@ def xla_test_library(name, for backend in filter_backends(backends): this_backend_copts = [] - if backend in ["cpu", "cpu_parallel", "gpu"]: + if backend in ["cpu", "gpu"]: backend_deps = ["//tensorflow/compiler/xla/tests:test_macros_%s" % backend] elif backend in plugins: backend_deps = plugins[backend]["deps"] diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 312d8f284d3421b4ef06b94c12949fc5fe4fa0b0..22660c35dcaa0ebbb553aa2d5e2412043a2bb300 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -32,8 +32,6 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -59,11 +57,15 @@ se::Platform* GetReferencePlatform() { } // namespace ClientLibraryTestBase::ClientLibraryTestBase( - perftools::gputools::Platform* platform, - const LocalClientOptions& client_options) + se::Platform* platform, const LocalClientOptions& client_options) : client_(GetOrCreateLocalClientOrDie(client_options)), execution_options_(CreateDefaultExecutionOptions()) { CHECK_EQ(platform, client_options.platform()); + + LocalClientOptions ref_options; + ref_options.set_platform(GetReferencePlatform()); + ref_client_ = GetOrCreateLocalClientOrDie(ref_options); + // Disabling constant_folding so that tests (usually written using Constants) // will exercise the intended code paths, instead of being constant folded. // @@ -155,6 +157,7 @@ ClientLibraryTestBase::ExecuteAndTransferReference( *execution_options.mutable_shape_with_output_layout() = *shape_with_output_layout; } + execution_options.clear_device_handles(); return ref_client_->ExecuteAndTransfer(computation, arguments, &execution_options); } @@ -214,6 +217,14 @@ void ClientLibraryTestBase::ComputeAndCompareR1( arguments); } +void ClientLibraryTestBase::ComputeAndCompareR1( + XlaBuilder* builder, const tensorflow::core::Bitmap& expected, + tensorflow::gtl::ArraySlice arguments) { + std::unique_ptr expected_literal = Literal::CreateR1(expected); + ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, + arguments); +} + template void ClientLibraryTestBase::ComputeAndCompareLiteral( BuilderT* builder, const Literal& expected, @@ -455,7 +466,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } void ClientLibraryTestBase::ComputeAndCompareR1U8( - ComputationBuilder* builder, tensorflow::StringPiece expected, + XlaBuilder* builder, tensorflow::StringPiece expected, tensorflow::gtl::ArraySlice arguments) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); @@ -616,8 +627,8 @@ ClientLibraryTestBase::ComputeValueAndReference( return std::make_pair(std::move(reference), std::move(result)); } -Computation ClientLibraryTestBase::CreateScalarRelu() { - ComputationBuilder builder(client_, "relu"); +XlaComputation ClientLibraryTestBase::CreateScalarRelu() { + XlaBuilder builder("relu"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); auto z_value = builder.Parameter(0, shape, "z_value"); auto zero = use_bfloat16_ @@ -629,8 +640,8 @@ Computation ClientLibraryTestBase::CreateScalarRelu() { return computation_status.ConsumeValueOrDie(); } -Computation ClientLibraryTestBase::CreateScalarMax() { - ComputationBuilder builder(client_, "max"); +XlaComputation ClientLibraryTestBase::CreateScalarMax() { + XlaBuilder builder("max"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); auto x = builder.Parameter(0, shape, "x"); auto y = builder.Parameter(1, shape, "y"); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index b3212dd2282375367ce890e960278fc469a5ef52..32eea7c2f3a65d2b4a83435ec6258ea9cf6aaf6a 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -64,11 +64,10 @@ std::vector ExpandUseBfloat16( // A client library test establishes an in-process XLA client connection. class ClientLibraryTestBase : public ::testing::Test { protected: - explicit ClientLibraryTestBase( - perftools::gputools::Platform* platform = nullptr); + explicit ClientLibraryTestBase(se::Platform* platform = nullptr); // Creates a new ClientLibraryTestBase with custom client options. - ClientLibraryTestBase(perftools::gputools::Platform* platform, + ClientLibraryTestBase(se::Platform* platform, const LocalClientOptions& client_options); // Returns the name of the test currently being run. @@ -166,6 +165,9 @@ class ClientLibraryTestBase : public ::testing::Test { void ComputeAndCompareR1(ComputationBuilder* builder, const tensorflow::core::Bitmap& expected, tensorflow::gtl::ArraySlice arguments); + void ComputeAndCompareR1(XlaBuilder* builder, + const tensorflow::core::Bitmap& expected, + tensorflow::gtl::ArraySlice arguments); template void ComputeAndCompareR2(BuilderT* builder, const Array2D& expected, @@ -220,7 +222,7 @@ class ClientLibraryTestBase : public ::testing::Test { // Compare the result of the computation to a strings. In XLA strings are // represented using rank-1 U8 shapes. void ComputeAndCompareR1U8( - ComputationBuilder* builder, tensorflow::StringPiece expected, + XlaBuilder* builder, tensorflow::StringPiece expected, tensorflow::gtl::ArraySlice arguments); // Convenience method for running a built computation, transferring the @@ -253,8 +255,8 @@ class ClientLibraryTestBase : public ::testing::Test { ErrorSpec error); // Create scalar operations for use in reductions. - Computation CreateScalarRelu(); - Computation CreateScalarMax(); + XlaComputation CreateScalarRelu(); + XlaComputation CreateScalarMax(); Computation CreateScalarReluSensitivity(); // Special case convenience functions for creating filled arrays. diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc index 32e2f2c0848407ec46a5ac52e2668ef27b92c426..1e5447179677318eb604590195e97f811024898a 100644 --- a/tensorflow/compiler/xla/tests/client_test.cc +++ b/tensorflow/compiler/xla/tests/client_test.cc @@ -109,8 +109,7 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) { /*minor_to_major=*/{1, 0}))); } -XLA_TEST_F(ClientTest, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) { +XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) { XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg; Shape shape = ShapeUtil::MakeShape(S32, {2, 2}); diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc index c15d808f1ddfb44a512fa395bb8e515bca3859b6..7ea82a791f72ea23be152ce9f548cc6719922236 100644 --- a/tensorflow/compiler/xla/tests/compute_constant_test.cc +++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc @@ -47,16 +47,14 @@ ClientType client_types[] = {ClientType::kLocal, ClientType::kCompileOnly}; class ComputeConstantTest : public ::testing::Test { public: - explicit ComputeConstantTest( - perftools::gputools::Platform* platform = nullptr) + explicit ComputeConstantTest(se::Platform* platform = nullptr) : platform_(platform) {} string TestName() const { return ::testing::UnitTest::GetInstance()->current_test_info()->name(); } - Client* ClientOrDie(::perftools::gputools::Platform* platform, - ClientType client_type) { + Client* ClientOrDie(se::Platform* platform, ClientType client_type) { if (client_type == ClientType::kLocal) { StatusOr result = ClientLibrary::GetOrCreateLocalClient(platform); @@ -107,7 +105,7 @@ class ComputeConstantTest : public ::testing::Test { return result.ok() ? result.ValueOrDie() : false; } - perftools::gputools::Platform* platform_; + se::Platform* platform_; }; TEST_F(ComputeConstantTest, ScalarInt32Literal) { diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc index 0842a8918bcfec037ab0f9aa24014c7d8296cdf8..e67a30d76c2fac1240c5f14a97dea421db2fed04 100644 --- a/tensorflow/compiler/xla/tests/convert_test.cc +++ b/tensorflow/compiler/xla/tests/convert_test.cc @@ -36,7 +36,7 @@ namespace { class ConvertTest : public ClientLibraryTestBase { public: - explicit ConvertTest(perftools::gputools::Platform* platform = nullptr) + explicit ConvertTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc index 896b34fb6e2762c14bd9ec2bf1ba13c548d4cf60..b5a42e305987df030c15d089f5877f73bb61de1b 100644 --- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc @@ -18,9 +18,9 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,13 +34,35 @@ limitations under the License. namespace xla { namespace { +StatusOr CreateConvDimensionNumbers( + int64 input_batch, int64 input_feature, int64 input_first_spatial, + int64 input_second_spatial, int64 output_batch, int64 output_feature, + int64 output_first_spatial, int64 output_second_spatial, + int64 kernel_output_feature, int64 kernel_input_feature, + int64 kernel_first_spatial, int64 kernel_second_spatial) { + ConvolutionDimensionNumbers dimension_numbers; + dimension_numbers.set_input_batch_dimension(input_batch); + dimension_numbers.set_input_feature_dimension(input_feature); + dimension_numbers.add_input_spatial_dimensions(input_first_spatial); + dimension_numbers.add_input_spatial_dimensions(input_second_spatial); + dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature); + dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature); + dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial); + dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial); + dimension_numbers.set_output_batch_dimension(output_batch); + dimension_numbers.set_output_feature_dimension(output_feature); + dimension_numbers.add_output_spatial_dimensions(output_first_spatial); + dimension_numbers.add_output_spatial_dimensions(output_second_spatial); + TF_RETURN_IF_ERROR(XlaBuilder::Validate(dimension_numbers)); + return dimension_numbers; +} + class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {}; // Tests the convolution operation with invalid input dimension numbers. TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) { auto dimension_numbers_status = - ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, - 1, 2, 3); + CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); ASSERT_THAT(dimension_numbers_status.status().error_message(), ::testing::HasSubstr("input are not unique")); @@ -49,8 +71,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) { // Tests the convolution operation with invalid weight dimension numbers. TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) { auto dimension_numbers_status = - ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, - 2, 2, 3); + CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); ASSERT_THAT(dimension_numbers_status.status().error_message(), ::testing::HasSubstr("weight are not unique")); @@ -59,8 +80,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) { // Tests the convolution operation with invalid output dimension numbers. TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) { auto dimension_numbers_status = - ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, - 1, 2, 3); + CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, 1, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); ASSERT_THAT(dimension_numbers_status.status().error_message(), ::testing::HasSubstr("output are not unique")); @@ -76,14 +96,14 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest, client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array)) .ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(*input_array); auto weight = builder.Parameter(0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight"); auto conv1 = builder.Conv(input, weight, {1, 1}, Padding::kValid); ConvolutionDimensionNumbers dim_nums = - ComputationBuilder::CreateDefaultConvDimensionNumbers(); + XlaBuilder::CreateDefaultConvDimensionNumbers(); // Swap batch_dimension and feature_dimension. int64 old_input_batch_dim = dim_nums.input_batch_dimension(); int64 old_output_batch_dim = dim_nums.output_batch_dimension(); diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index 9c1145def8c11f1222c63adf006102887d49f00d..50d6e25d868c4964ff35023b43a3734ed115bbb8 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -52,7 +53,7 @@ class ConvolutionVariantsTest : public ClientLibraryTestBase { }; XLA_TEST_F(ConvolutionVariantsTest, Minimal) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Array4D input_array(1, 1, 1, 1, {2}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -67,7 +68,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Minimal) { } XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Array4D input_array(5, 1, 1, 1, {1, 2, 3, 4, 5}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -82,7 +83,7 @@ XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) { } XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(2, 1, 3, 4); input_array.FillWithMultiples(1); @@ -99,7 +100,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) { } XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 2, 1, 1, {10, 1}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -114,7 +115,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 2, {1, 2}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -129,7 +130,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -144,7 +145,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -159,7 +160,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -174,7 +175,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -189,7 +190,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array( 2, 2, 2, 3, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, // plane 0 @@ -210,7 +211,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 4, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -225,7 +226,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 5, {1, 2, 3, 4, 5}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -240,7 +241,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 4, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -255,7 +256,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 5, {1, 2, 3, 4, 5}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -270,7 +271,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -285,7 +286,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 1, {1}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -300,7 +301,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) { } XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -315,7 +316,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) { } XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -333,7 +334,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 2, 1, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -348,7 +349,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -363,7 +364,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -378,7 +379,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(64); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -398,7 +399,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(16 * 1 * 1 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -419,7 +420,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); constexpr int bs = 16; constexpr int kx = 2; @@ -450,7 +451,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); constexpr int kx = 2; constexpr int ky = 2; @@ -482,7 +483,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(16, 1, 8, 8); for (int i0 = 0; i0 < 16; ++i0) { @@ -510,7 +511,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -536,7 +537,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(2 * 2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -562,7 +563,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(32 * 2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -602,7 +603,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(16, 16, 1, 1); Array4D filter_array(16, 16, 1, 1); @@ -628,7 +629,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) { } XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 4 * 6); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -640,14 +641,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) { builder.ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{2, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 2, 2, {3924, 4257, 5922, 6255}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -659,14 +660,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) { builder.ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 8, {10, 2, 20, 3, 30, 4, 40, 5}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 3 * 4); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -682,8 +683,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { builder.ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{2, 1}, /*padding=*/{{1, 0}, {0, 0}}, /*lhs_dilation=*/{3, 2}, - /*rhs_dilation=*/{}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 3, 5, {204, 40, 406, 60, 608, // @@ -693,7 +693,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { } XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -705,14 +705,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) { builder.ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-1, -1}}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 2, {23, 34}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -724,14 +724,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) { builder.ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-1, 2}}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 5, {23, 34, 45, 50, 0}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -743,14 +743,14 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) { builder.ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {2, -1}}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 5, {0, 1, 12, 23, 34}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -763,7 +763,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {3, 2}}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); // input: // [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5] @@ -775,7 +775,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -788,7 +788,7 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) { /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-3, -2}}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); // input: // [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5] @@ -821,7 +821,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x1x2x3_Filter2x1x1x2) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -854,7 +854,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x16x1x1_Filter1x16x1x1) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -887,7 +887,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter1x16x1x1) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -920,7 +920,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -954,7 +954,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -966,7 +966,7 @@ XLA_TEST_F(ConvolutionVariantsTest, } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1010,7 +1010,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1054,7 +1054,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1095,7 +1095,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 2); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1147,7 +1147,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) { // BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1) XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); @@ -1166,19 +1166,18 @@ XLA_TEST_F(ConvolutionVariantsTest, // BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1)) XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingGreaterThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 1, /*values=*/{1})); auto weights = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 3, /*values=*/{1, 10, 100})); auto mirrored_weights = builder.Rev(weights, {2, 3}); - builder.ConvGeneralDilated( - gradients, mirrored_weights, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {0, 3}}, - /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + builder.ConvGeneralDilated(gradients, mirrored_weights, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {0, 3}}, + /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{}, + XlaBuilder::CreateDefaultConvDimensionNumbers()); ComputeAndCompareR4(&builder, {{{{100, 0}}}}, {}, error_spec_); } @@ -1187,7 +1186,7 @@ XLA_TEST_F(ConvolutionVariantsTest, // into // BackwardInputConv([1], [1,10,100], padding=(1,1)) XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 1, /*values=*/{1})); @@ -1208,7 +1207,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) { // However, XLA:GPU doesn't actually fuse it because PadInsertion doesn't // support negative padding on backward convolution yet (b/32744257). XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); @@ -1224,7 +1223,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) { XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingLessThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // activations: 1,2,3,4 ---pad--> 0,1,2,3,4,0,0 // gradients: 100,10,1 -dilate-> 100,0,10,0,1 @@ -1240,7 +1239,7 @@ XLA_TEST_F(ConvolutionVariantsTest, /*window_strides=*/{1, 1}, /*padding=*/{{0, 0}, {1, 2}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); builder.Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{24, 130, 240}}}}, {}, error_spec_); @@ -1248,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest, XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingGreaterThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // activations: 1,2,3,4 ---pad--> 0,0,1,2,3,4 // gradients: 100,10,1 -dilate-> 100,0,10,0,1 @@ -1266,14 +1265,14 @@ XLA_TEST_F(ConvolutionVariantsTest, /*window_strides=*/{1, 1}, /*padding=*/{{0, 0}, {2, 0}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); builder.Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{13, 24}}}}, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // activations: 1,2,3,4 ---pad--> 0,0,1,2,3,4,0 // gradients: 100,10,1 -dilate-> 100,0,10,0,1 @@ -1293,14 +1292,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { /*window_strides=*/{1, 1}, /*padding=*/{{0, 0}, {2, 1}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); builder.Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{13, 24, 130}}}}, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR3FromArray3D( Array3D(1, 1, 1, /*value=*/1)); @@ -1314,26 +1313,26 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) { } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto activations = builder.ConstantR3FromArray3D(Array3D({{{1, 2, 3, 4}}})); auto gradients = builder.ConstantR3FromArray3D(Array3D({{{100, 10, 1}}})); - auto forward_conv = builder.ConvGeneralDilated( - activations, gradients, - /*window_strides=*/{1}, - /*padding=*/{{2, 1}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers( - /*num_spatial_dims=*/1)); + auto forward_conv = + builder.ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1}, + /*padding=*/{{2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, + XlaBuilder::CreateDefaultConvDimensionNumbers( + /*num_spatial_dims=*/1)); builder.Transpose(forward_conv, {0, 1, 2}); ComputeAndCompareR3(&builder, {{{13, 24, 130}}}, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients_flat = Literal::CreateR1({1}); auto gradients_literal = @@ -1357,7 +1356,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto activations_flat = Literal::CreateR1({1, 2, 3, 4}); auto activations_literal = @@ -1378,7 +1377,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { /*window_strides=*/{1, 1, 1}, /*padding=*/{{0, 0}, {0, 0}, {2, 1}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers( + XlaBuilder::CreateDefaultConvDimensionNumbers( /*num_spatial_dims=*/3)); builder.Transpose(forward_conv, {0, 1, 2, 3, 4}); ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_); diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 5f00c34002803553b9c17b4fce0abafda7369796..ff53a84588fc04effac217112425e5868dfeaa67 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -35,8 +35,6 @@ limitations under the License. #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -472,13 +470,6 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { template void RunR3Contiguous(std::vector operand_shape, int32 index, int32 size) { -#ifdef XLA_TEST_BACKEND_CPU_PARALLEL - // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. - if (std::is_same::value) { - return; - } -#endif - const int32 kSeq = operand_shape[0]; const int32 kBatch = operand_shape[1]; const int32 kDim = operand_shape[2]; @@ -541,30 +532,22 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0(); } // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) { - TestR1(); -} +XLA_TEST_F(DynamicUpdateSliceTest, Int32R1BF16) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } -// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R2BF16)) { - TestR2(); -} +XLA_TEST_F(DynamicUpdateSliceTest, Int32R2BF16) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } -// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R3BF16)) { - TestR3(); -} +XLA_TEST_F(DynamicUpdateSliceTest, Int32R3BF16) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32WrapBF16)) { +XLA_TEST_F(DynamicUpdateSliceTest, Int32WrapBF16) { TestWrap(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap(); } @@ -737,11 +720,11 @@ void BM_DynamicSlice(int num_iters) { auto start_indices_literal = Literal::CreateR1({0, 1, 2, 3}); ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice( - executors[device_ordinal], *start_indices_literal, *buffer)); + executors[device_ordinal], *start_indices_literal, buffer)); std::unique_ptr executable = client - ->Compile(computation, {&buffer->on_host_shape()}, + ->Compile(computation, {&buffer.on_host_shape()}, ExecutableBuildOptions()) .ConsumeValueOrDie(); @@ -750,14 +733,14 @@ void BM_DynamicSlice(int num_iters) { options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = executable->Run({buffer.get()}, options); + auto result = executable->Run({&buffer}, options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = executable->Run({buffer.get()}, options); + auto result = executable->Run({&buffer}, options); ASSERT_TRUE(result.ok()); } } diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc index 644cbbf40f296eb2a574ae568b4f32aa3d0bd12f..c8cc8e40aa32105e56e7e13489734af6ce56e0c7 100644 --- a/tensorflow/compiler/xla/tests/execution_profile_test.cc +++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc @@ -24,8 +24,7 @@ namespace { class ExecutionProfileTest : public ClientLibraryTestBase {}; -XLA_TEST_F(ExecutionProfileTest, - DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) { +XLA_TEST_F(ExecutionProfileTest, ExecuteWithExecutionProfile) { Shape shape = ShapeUtil::MakeShape(F32, {256, 256}); TF_ASSERT_OK_AND_ASSIGN( diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index a292eab1d198fbf69c6dc81c780487ea46756f72..6f89e9164c8d44cbc6afcceb653b0e57841bf500 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -50,8 +50,6 @@ limitations under the License. using tensorflow::gtl::ArraySlice; -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -796,19 +794,19 @@ void BM_ParallelFusion(int num_iters) { // Transfer literals to device. auto param0_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1); - std::unique_ptr buffer0 = + ScopedShapedBuffer buffer0 = client->LiteralToShapedBuffer(*param0_literal, device_ordinal) .ConsumeValueOrDie(); auto param1_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1); - std::unique_ptr buffer1 = + ScopedShapedBuffer buffer1 = client->LiteralToShapedBuffer(*param1_literal, device_ordinal) .ConsumeValueOrDie(); auto param2_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1); - std::unique_ptr buffer2 = + ScopedShapedBuffer buffer2 = client->LiteralToShapedBuffer(*param2_literal, device_ordinal) .ConsumeValueOrDie(); @@ -816,8 +814,8 @@ void BM_ParallelFusion(int num_iters) { std::unique_ptr executable = client ->Compile(computation, - {&buffer0->on_host_shape(), &buffer1->on_host_shape(), - &buffer2->on_host_shape()}, + {&buffer0.on_host_shape(), &buffer1.on_host_shape(), + &buffer2.on_host_shape()}, ExecutableBuildOptions()) .ConsumeValueOrDie(); @@ -838,8 +836,7 @@ void BM_ParallelFusion(int num_iters) { // Run some warm-up executions. const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = - executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options); + auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options); ASSERT_TRUE(result.ok()); } @@ -852,8 +849,7 @@ void BM_ParallelFusion(int num_iters) { tensorflow::testing::UseRealTime(); tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = - executable->Run({buffer0.get(), buffer1.get(), buffer2.get()}, options); + auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options); ASSERT_TRUE(result.ok()); } } diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc index 90496d55e60b4f45fc2d46b2746f94d775cf9f94..4dd3acd9af16210408229d0c8980173314766278 100644 --- a/tensorflow/compiler/xla/tests/gather_operation_test.cc +++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc @@ -401,10 +401,7 @@ ENTRY main { class GatherClientLibraryTest : public ClientLibraryTestBase {}; -// TODO(b/30671675): Asynchronous execution on stream is not yet supported on -// GPU and CPU_PARALLEL. -XLA_TEST_F(GatherClientLibraryTest, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) { +XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { // We create this HLO, but using the XlaBuilder API. // // ENTRY main { diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 21f71fc91bb84540e5347811cb4643a8aeda445c..9984aba089be89072c5a49f93df68ec805658b68 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -35,8 +35,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -115,11 +113,13 @@ StatusOr> HloTestBase::Execute( return test_runner_.Execute(std::move(module), arguments); } -StatusOr> HloTestBase::ExecuteNoHloPasses( +std::unique_ptr HloTestBase::ExecuteNoHloPasses( std::unique_ptr module, tensorflow::gtl::ArraySlice arguments) { - return test_runner_.Execute(std::move(module), arguments, - /*run_hlo_passes=*/false); + return test_runner_ + .Execute(std::move(module), arguments, + /*run_hlo_passes=*/false) + .ValueOrDie(); } std::unique_ptr HloTestBase::ExecuteAndTransfer( diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 3e8e2360bb3a87e127920cd222803c0f7b9161f4..79fcea9403e6e2dfc989c86ce8c6609f44acd12b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -76,8 +76,7 @@ class HloTestBase : public ::testing::Test { // If your test doesn't use interpreter as the reference backend, you can use // this constructor. Note that your test target is responsible for linking in // both needed backends. - HloTestBase(::perftools::gputools::Platform* test_platform, - ::perftools::gputools::Platform* reference_platform); + HloTestBase(se::Platform* test_platform, se::Platform* reference_platform); ~HloTestBase() override {} @@ -100,7 +99,7 @@ class HloTestBase : public ::testing::Test { // Same as above, except the module will be executed without running any HLO // passes on it. - StatusOr> ExecuteNoHloPasses( + std::unique_ptr ExecuteNoHloPasses( std::unique_ptr module, tensorflow::gtl::ArraySlice arguments); diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index 81630df34c58526b6d41492b2b4b3892a02a21c2..c28f79ae386670ca80d603a42f6629dfd30e0bc9 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -39,6 +39,11 @@ limitations under the License. namespace xla { +using ::tensorflow::strings::Appendf; +using ::tensorflow::strings::Printf; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + /* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes( const Shape& expected, const Shape& actual) { if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) { @@ -173,14 +178,11 @@ template auto lhs_double = static_cast(lhs); auto rhs_double = static_cast(rhs); if (ulhs != urhs) { - return ::testing::AssertionFailure() << tensorflow::strings::Printf( + return ::testing::AssertionFailure() << Printf( "floating values are not bitwise-equal; and equality testing " "was requested: %s=%g=%a vs %s=%g=%a", - tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs)) - .c_str(), - lhs_double, lhs_double, - tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs)) - .c_str(), + StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, + lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double); } return ::testing::AssertionSuccess(); @@ -264,9 +266,7 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, << "expected:\n" << expected.ToString() << "\n\tvs actual:\n" << actual.ToString() - << (message.empty() - ? "" - : tensorflow::strings::StrCat("\nmessage: ", message)); + << (message.empty() ? "" : StrCat("\nmessage: ", message)); } /* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected, @@ -321,9 +321,8 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, case TUPLE: { bool tuple_match = true; for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - SCOPED_TRACE(tensorflow::strings::StrCat( - "Tuple index ", i, " in ", - ShapeUtil::HumanString(expected.shape()))); + SCOPED_TRACE(StrCat("Tuple index ", i, " in ", + ShapeUtil::HumanString(expected.shape()))); // Create LiteralViews of the expected and actual elements. auto result = Equal(LiteralView::Create(expected, {i}), @@ -350,227 +349,301 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, namespace { +// Gets the total element count. For tuples, this is not the count of tuple +// elements, but the sum of elements of each tuple element. +int64 RecursiveElementCount(const Shape& shape) { + if (ShapeUtil::IsTuple(shape)) { + const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); + int64 total = 0; + for (int64 i = 0; i < tuple_elements; ++i) { + total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); + } + return total; + } else { + return ShapeUtil::ElementsIn(shape); + } +} + +// Calling ToString on a literal with over 100 million elements takes around +// 3 minutes. The utility of printing a literal with >1000 elements is +// questionable, especially when writing the Literal proto to disk is orders +// of magnitude faster. +string TruncateHugeLiteral(const Literal& literal) { + return RecursiveElementCount(literal.shape()) < 1000 + ? literal.ToString() + : "[TRUNCATED, Literal with more than 1000 values]"; +} + +// Returns whether the actual and expected values are mismatched with respect to +// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec. +template +bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) { + if (relaxed_nans) { + return !std::isnan(expected) && std::isnan(actual); + } else { + return std::isnan(expected) != std::isnan(actual); + } +} + +template <> +bool NanMismatch(complex64 expected, complex64 actual, + bool relaxed_nans) { + return NanMismatch(expected.real(), actual.real(), relaxed_nans) || + NanMismatch(expected.imag(), actual.imag(), relaxed_nans); +} + +template <> +bool NanMismatch(half expected, half actual, bool relaxed_nans) { + return NanMismatch(static_cast(expected), + static_cast(actual), relaxed_nans); +} + +// Converts the given floating-point value to a string. +template +string FpValueToString(NativeT value) { + return Printf("%8.4g", static_cast(value)); +} + +template <> +string FpValueToString(complex64 value) { + return Printf("%8.4g + %8.4fi", value.real(), value.imag()); +} + +// Returns the absolute value of the given floating point value. This function +// is used instead of std::abs directly in order to allow type-dependent +// implementations for NearComparator. +template +float FpAbsoluteValue(NativeT value) { + return std::abs(value); +} + +template <> +float FpAbsoluteValue(bfloat16 value) { + return FpAbsoluteValue(static_cast(value)); +} + +template <> +float FpAbsoluteValue(half value) { + return FpAbsoluteValue(static_cast(value)); +} + // Helper class for comparing floating-point literals within an error bound. +template class NearComparator { public: - explicit NearComparator(ErrorSpec error) : error_(error) {} + // Compares the two array literals elementwise and returns an assertion + // result. The assertion result is successful if all actual and expected + // elements are within the given error bound. In case of error, the assertion + // result contains a detailed error message in case of failure. + static ::testing::AssertionResult Compare(const Literal& expected, + const Literal& actual, + ErrorSpec error, + bool detailed_message) { + NearComparator comparator(expected, actual, error, + detailed_message); + return comparator.Run(); + } + + private: + // Data structure encapsulating metadata about a single element mismatch. + struct Mismatch { + NativeT actual; + NativeT expected; + float rel_error; + float abs_error; + + // The linear index of the failure within the shape. This linear index is + // from the 'actual' literal. + int64 linear_index; + + bool operator<(const Mismatch& other) const { + return rel_error < other.rel_error; + } - // Compares the two literals elementwise. EXPECTs each pair of elements to be - // within the error bound. Emits useful log messages and dumps literals to - // temporary files on failure. Returns true if literals match. - bool ExpectNear(const Literal& expected, const Literal& actual) { + string ToString(const Shape& shape) const { + return Printf( + "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g", + FpValueToString(actual).c_str(), FpValueToString(expected).c_str(), + LiteralTestUtil::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex(shape, + linear_index)) + .c_str(), + rel_error, abs_error); + } + }; + + explicit NearComparator(const Literal& expected, const Literal& actual, + ErrorSpec error, bool detailed_message) + : expected_(expected), + actual_(actual), + error_(error), + detailed_message_(detailed_message), + abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}), + abs_error_buckets_(kErrorBucketBounds.size(), 0), + rel_error_buckets_(kErrorBucketBounds.size(), 0) {} + + // Runs the comparison between expected and actual literals. + ::testing::AssertionResult Run() { VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, TruncateHugeLiteral(expected)); + XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_)); VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, TruncateHugeLiteral(actual)); + XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_)); // If the shapes mismatch, we simply fail the expectation instead of // printing out data, as it's a type error rather than a value error. ::testing::AssertionResult equal_shapes = - LiteralTestUtil::EqualShapes(expected.shape(), actual.shape()); + LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape()); if (!equal_shapes) { - EXPECT_TRUE(equal_shapes); - return false; + return equal_shapes; } - - // Set up members used during the comparison. - num_miscompares_ = 0; - abs_diff_sum_ = 0.0; - abs_expected_sum_ = 0.0; - abs_diff_miscompare_sum_ = 0.0; - abs_expected_miscompare_sum_ = 0.0; - max_rel_err_ = 0.0; - max_abs_err_ = 0.0; - first_linear_index_ = -1; - last_linear_index_ = -1; - max_rel_linear_index_ = -1; - max_abs_linear_index_ = -1; - miscompares_ = Literal(ShapeUtil::ChangeElementType(actual.shape(), PRED)); - miscompares_.PopulateWithValue(false); - multi_index_.resize(expected.shape().dimensions_size(), 0); - - switch (expected.shape().element_type()) { - case BF16: - ExpectLiteralsNear(expected, actual, 0); - break; - case F16: - ExpectLiteralsNear(expected, actual, 0); - break; - case F32: - ExpectLiteralsNear(expected, actual, 0); - break; - case F64: - ExpectLiteralsNear(expected, actual, 0); - break; - case C64: - ExpectLiteralsNear(expected, actual, 0); - break; - default: - LOG(FATAL) << "Unsupported primitive type in near comparator: " - << PrimitiveType_Name(expected.shape().element_type()) - << ". Must be floating-point type."; + if (!ShapeUtil::IsArray(expected_.shape())) { + return ::testing::AssertionFailure() << "Expected array shape"; } - if (num_miscompares_ > 0) { - if (!VLOG_IS_ON(1)) { - LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) - << " " << TruncateHugeLiteral(expected); - LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual.shape()) - << " " << TruncateHugeLiteral(actual); - LOG(INFO) << "Dumping literals to temp files..."; - WriteLiteralToTempFile(expected, "expected"); - WriteLiteralToTempFile(actual, "actual"); - WriteLiteralToTempFile(miscompares_, "miscompares"); - } - EXPECT_TRUE(num_miscompares_ == 0) - << "\nmax relative mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), max_rel_linear_index_)) - << "\nmaximum relative error " << max_rel_err_ - << "\nmax absolute mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), max_abs_linear_index_)) - << "\nmaximum absolute error " << max_abs_err_ - << "\nfirst mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), first_linear_index_)) - << "\nlast mismatch at index " - << LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex( - actual.shape(), last_linear_index_)) - << "\ntotal absolute error " << abs_diff_sum_ - << "\ntotal absolute error of miscompares " - << abs_diff_miscompare_sum_ << "\ntotal relative error " - << (abs_diff_sum_ / abs_expected_sum_) - << "\ntotal relative error of miscompares " - << (abs_diff_miscompare_sum_ / abs_expected_miscompare_sum_) - << "\nfailure count " << num_miscompares_; + mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED)); + mismatches_.PopulateWithValue(false); + + CompareLiterals(); + + if (num_mismatches_ == 0) { + return ::testing::AssertionSuccess(); + } else if (!VLOG_IS_ON(1)) { + LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape()) + << " " << TruncateHugeLiteral(expected_); + LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual_.shape()) + << " " << TruncateHugeLiteral(actual_); + LOG(INFO) << "Dumping literals to temp files..."; + WriteLiteralToTempFile(expected_, "expected"); + WriteLiteralToTempFile(actual_, "actual"); + WriteLiteralToTempFile(mismatches_, "mismatches"); } - return num_miscompares_ == 0; + return ::testing::AssertionFailure() << ErrorMessage(); } - private: - template - bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) { - if (relaxed_nans) { - return !std::isnan(expected) && std::isnan(actual); - } else { - return std::isnan(expected) != std::isnan(actual); + // Insert the given absolute value into the absolute value bucket vector. The + // bounds of the buckets are given by kAbsValueBucketBounds. + void UpdateAbsValueBucket(NativeT value, bool is_mismatch) { + // Adjust the bucket containing the absolute values of the 'actual' + // elements. + const float abs_value = FpAbsoluteValue(value); + for (int i = 0; i < abs_value_buckets_.size(); ++i) { + if (i == abs_value_buckets_.size() - 1 || + (abs_value >= kAbsValueBucketBounds[i] && + abs_value < kAbsValueBucketBounds[i + 1])) { + // The first value of the pair is the count of elements in the bucket, + // the second is the count of mismatches in the bucket. + abs_value_buckets_[i].first++; + if (is_mismatch) { + abs_value_buckets_[i].second++; + } + return; + } } } - template - void ExpectNear(NativeT expected, NativeT actual, - const ::testing::Message& message) { - EXPECT_NEAR(expected, actual, error_.abs) - << "expected:\n " << expected << "\n\tvs actual:\n " << actual << "\n" - << message; - } - - // EXPECTs that the two given scalar values are within the error bound. Keeps - // track of how many mismatches have occurred to keep the size of the output - // manageable. - template - bool ExpectValuesNear(NativeT expected, NativeT actual) { - if (expected == actual) { - return true; + // Insert the given error into the given error bucket vector. + void UpdateErrorBucket( + float error, tensorflow::gtl::MutableArraySlice error_buckets) { + CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size()); + for (int i = 0; i < error_buckets.size(); ++i) { + if (error >= kErrorBucketBounds[i]) { + error_buckets[i]++; + } } - - const float abs_diff = std::abs(actual - expected); - const float rel_err = abs_diff / std::abs(expected); - const bool nan_mismatch = - NanMismatch(expected, actual, error_.relaxed_nans); - const bool mismatch = - (nan_mismatch || (abs_diff >= error_.abs && rel_err >= error_.rel)); - return !mismatch; } - // Assumes that expected vs actual fail ExpectValuesNear. - template - void UpdateAndLogMiscompares(const NativeT expected, const NativeT actual, - const Shape& shape, const int64 linear_index) { - const float abs_diff = std::abs(actual - expected); - const float rel_err = abs_diff / std::abs(expected); - abs_diff_sum_ += abs_diff; - abs_expected_sum_ += std::abs(expected); - if (rel_err > max_rel_err_ || std::isnan(rel_err)) { - max_rel_err_ = rel_err; - max_rel_linear_index_ = linear_index; + // Compares the two given elements from the expected and actual literals at + // the given literal_index and keeps track of various mismatch statistics. + void CompareValues(NativeT expected, NativeT actual, int64 linear_index) { + const bool is_nan_mismatch = + NanMismatch(expected, actual, error_.relaxed_nans); + float abs_error; + float rel_error; + if (actual == expected) { + abs_error = 0; + rel_error = 0; + } else if (is_nan_mismatch) { + num_nan_mismatches_++; + // A nan mismatch is considered to have infinite error. rel_error is used + // for sorting a std::set of the top mismatchs, and a nan value here will + // result in undefined behavior because nan's do not satisfy the strict + // weak ordering requirement of std containers. + abs_error = std::numeric_limits::infinity(); + rel_error = std::numeric_limits::infinity(); + } else { + abs_error = FpAbsoluteValue(actual - expected); + rel_error = abs_error / FpAbsoluteValue(expected); } - if (abs_diff > max_abs_err_ || std::isnan(abs_diff)) { - max_abs_err_ = abs_diff; - max_abs_linear_index_ = linear_index; + const bool is_abs_mismatch = abs_error > error_.abs; + const bool is_rel_mismatch = rel_error > error_.rel; + const bool is_mismatch = + is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch); + + // Update the error of the relative bucket only if the *absolute* error + // bound is exceeded and vice versa. + if (is_abs_mismatch) { + num_abs_mismatches_++; + UpdateErrorBucket(rel_error, &rel_error_buckets_); } - if (VLOG_IS_ON(10)) { - VLOG(10) << tensorflow::strings::Printf( - "index %s abs_diff %f rel_err %f", - LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex(shape, - linear_index)) - .c_str(), - abs_diff, rel_err); + if (is_rel_mismatch) { + num_rel_mismatches_++; + UpdateErrorBucket(abs_error, &abs_error_buckets_); } - abs_diff_miscompare_sum_ += abs_diff; - abs_expected_miscompare_sum_ += std::abs(expected); - const int64 kMaxFailures = 2; - if (num_miscompares_ < kMaxFailures) { - const auto multi_index = - IndexUtil::LinearIndexToMultidimensionalIndex(shape, linear_index); - ::testing::Message msg; - msg << "mismatch at index " - << LiteralTestUtil::MultiIndexAsString(multi_index) << " abs diff " - << abs_diff << " rel err " << rel_err << " failure #" - << num_miscompares_; - ExpectNear(expected, actual, msg); - } else if (num_miscompares_ == kMaxFailures) { - LOG(ERROR) << "reached max 'loud' failure count; silently proceeding..."; + + UpdateAbsValueBucket(actual, is_mismatch); + + if (!is_mismatch) { + return; } - if (num_miscompares_ == 0) { - first_linear_index_ = linear_index; + + num_mismatches_++; + + // Keep track of the kTopRelativeErrorCount relative error mismatches. + if (top_rel_mismatches_.size() < kTopRelativeErrorCount || + rel_error > top_rel_mismatches_.begin()->rel_error) { + Mismatch mismatch = {actual, expected, rel_error, abs_error, + linear_index}; + top_rel_mismatches_.insert(mismatch); + if (top_rel_mismatches_.size() > kTopRelativeErrorCount) { + top_rel_mismatches_.erase(top_rel_mismatches_.begin()); + } } - num_miscompares_++; - last_linear_index_ = linear_index; - miscompares_.data()[linear_index] = true; + + mismatches_.data()[linear_index] = true; } - // Recursive function which compares the two given literals elementwise. - template - void ExpectLiteralsNear(const Literal& expected, const Literal& actual, - int64 dimension) { + // Compares the two literals elementwise. + void CompareLiterals() { // Fast path optimization for the case were layouts match. - if (LayoutUtil::Equal(actual.shape().layout(), expected.shape().layout())) { + if (LayoutUtil::Equal(actual_.shape().layout(), + expected_.shape().layout())) { tensorflow::gtl::ArraySlice expected_data = - expected.data(); + expected_.data(); tensorflow::gtl::ArraySlice actual_data = - actual.data(); + actual_.data(); const int64 len = expected_data.size(); for (int64 i = 0; i < len; ++i) { - const bool near = ExpectValuesNear(expected_data[i], actual_data[i]); - if (!near) { - UpdateAndLogMiscompares(expected_data[i], actual_data[i], - actual.shape(), i); - } + CompareValues(expected_data[i], actual_data[i], i); } return; } + std::vector multi_index(ShapeUtil::Rank(actual_.shape()), 0); + CompareLiteralsSlow(0, &multi_index); + } - if (dimension == expected.shape().dimensions_size()) { - bool near = ExpectValuesNear(expected.Get(multi_index_), - actual.Get(multi_index_)); - if (!near) { - UpdateAndLogMiscompares( - expected.Get(multi_index_), - actual.Get(multi_index_), actual.shape(), - IndexUtil::MultidimensionalIndexToLinearIndex(actual.shape(), - multi_index_)); - } + // Slow path for CompareLiterals when 'actual' and 'expected' literals have + // different layouts. In this case, multidimensional indices are constructed + // and indexed for each element. + void CompareLiteralsSlow(int64 dimension, std::vector* multi_index) { + if (dimension == multi_index->size()) { + CompareValues(expected_.Get(*multi_index), + actual_.Get(*multi_index), + IndexUtil::MultidimensionalIndexToLinearIndex( + actual_.shape(), *multi_index)); } else { - for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { - multi_index_[dimension] = i; - ExpectLiteralsNear(expected, actual, dimension + 1); + for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) { + (*multi_index)[dimension] = i; + CompareLiteralsSlow(dimension + 1, multi_index); } } } @@ -580,159 +653,247 @@ class NearComparator { int64 now_usec = tensorflow::Env::Default()->NowMicros(); string filename = tensorflow::io::JoinPath( tensorflow::testing::TmpDir(), - tensorflow::strings::Printf("tempfile-%s-%llx-%s", Hostname().c_str(), - now_usec, name.c_str())); + Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec, + name.c_str())); TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename, literal.ToProto())); LOG(ERROR) << "wrote to " << name << " file: " << filename; } - // Gets the total element count. For tuples, this is not the count of tuple - // elements, but the sum of elements of each tuple element. - int64 RecursiveElementCount(const Shape& shape) { - if (ShapeUtil::IsTuple(shape)) { - const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); - int64 total = 0; - for (int64 i = 0; i < tuple_elements; ++i) { - total += - RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); - } - return total; - } else { - return ShapeUtil::ElementsIn(shape); + // Returns an error message string with a detailed breakdown of the + // mismatches. Called after calling Run(). + string ErrorMessage() { + string out; + int64 element_count = ShapeUtil::ElementsIn(actual_.shape()); + + auto percent_string = [](float a, float b) { + float pct = b == 0.0 ? 0.0 : 100.0 * a / b; + return Printf("%0.4f%%", pct); + }; + + Appendf(&out, + "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound " + "%g, rel bound %g\n", + num_mismatches_, + percent_string(num_mismatches_, element_count).c_str(), + ShapeUtil::HumanString(actual_.shape()).c_str(), + ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); + if (num_nan_mismatches_ > 0) { + StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n"); + } + Appendf(&out, "Top relative error mismatches:\n"); + for (auto it = top_rel_mismatches_.rbegin(); + it != top_rel_mismatches_.rend(); ++it) { + StrAppend(&out, " ", it->ToString(actual_.shape()).c_str(), "\n"); } - } - // Calling ToString on a literal with over 100 million elements takes around - // 3 minutes. The utility of printing a literal with >1000 elements is - // questionable, especially when writing the Literal proto to disk is orders - // of magnitude faster. - string TruncateHugeLiteral(const Literal& literal) { - return RecursiveElementCount(literal.shape()) < 1000 - ? literal.ToString() - : "[TRUNCATED, Literal with more than 1000 values]"; - } + if (!detailed_message_) { + return out; + } - ErrorSpec error_; + StrAppend(&out, "Absolute magnitude breakdown of actual values:\n"); + CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size()); + for (int i = 0; i < abs_value_buckets_.size(); ++i) { + const int64 bucket_size = abs_value_buckets_[i].first; + const int64 bucket_mismatches = abs_value_buckets_[i].second; + string mismatch_str = bucket_mismatches > 0 + ? Printf(", mismatches %lld", bucket_mismatches) + : ""; + Appendf(&out, " %-6g <= x < %-6g : %7lld (%9s)%s\n", + kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], + bucket_size, percent_string(bucket_size, element_count).c_str(), + mismatch_str.c_str()); + } - // Number of element miscomparisons encountered so far. - int64 num_miscompares_; + auto print_accum_buckets = [&](const string& header, int64 total, + tensorflow::gtl::ArraySlice buckets) { + StrAppend(&out, header, ":\n"); + Appendf(&out, " < %-6g : %7lld (%s)\n", kErrorBucketBounds[0], + total - buckets[0], + percent_string(total - buckets[0], total).c_str()); + CHECK_EQ(buckets.size(), kErrorBucketBounds.size()); + for (int i = 0; i < kErrorBucketBounds.size(); ++i) { + Appendf(&out, " >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i], + buckets[i], percent_string(buckets[i], total).c_str()); + } + }; + Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n", + error_.abs, num_abs_mismatches_, + percent_string(num_abs_mismatches_, element_count).c_str()); + print_accum_buckets( + "Relative error breakdown of elements exceeding abs error bound", + num_abs_mismatches_, rel_error_buckets_); + Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n", + error_.rel, num_rel_mismatches_, + percent_string(num_rel_mismatches_, element_count).c_str()); + print_accum_buckets( + "Absolute error breakdown of elements exceeding rel error bound", + num_rel_mismatches_, abs_error_buckets_); + return out; + } - // A Literal containing which elements did not match in the expected and - // actual literals. miscompares_ contains PREDs and is of the same sizes as - // the comparison literals. - Literal miscompares_; - - // A multidimensional index used when performing the recursive comparison. - std::vector multi_index_; - - // Aggregated Statistics on input. - double abs_diff_sum_; - double abs_expected_sum_; - double abs_diff_miscompare_sum_; - double abs_expected_miscompare_sum_; - float max_rel_err_; - float max_abs_err_; - int64 first_linear_index_; - int64 last_linear_index_; - int64 max_rel_linear_index_; - int64 max_abs_linear_index_; -}; + // 'actual' and 'expected' literals being compared. + const Literal& expected_; + const Literal& actual_; -template <> -bool NearComparator::NanMismatch(complex64 expected, - complex64 actual, - bool relaxed_nans) { - return NanMismatch(expected.real(), actual.real(), relaxed_nans) || - NanMismatch(expected.imag(), actual.imag(), relaxed_nans); -} + // The error bounds of the comparison. + ErrorSpec error_; -template <> -void NearComparator::ExpectNear(complex64 expected, complex64 actual, - const ::testing::Message& message) { - EXPECT_NEAR(expected.real(), actual.real(), error_.abs) - << "expected:\n " << expected << "\n\tvs actual:\n " << actual << "\n" - << message; - EXPECT_NEAR(expected.imag(), actual.imag(), error_.abs) - << "expected:\n " << expected << "\n\tvs actual:\n " << actual << "\n" - << message; -} + // Whether to include detailed breakdown of mismatches in the error message. + bool detailed_message_; -template <> -bool NearComparator::ExpectValuesNear(bfloat16 expected, - bfloat16 actual) { - return ExpectValuesNear(static_cast(expected), - static_cast(actual)); -} + // Number of element element mismatches encountered so far. + int64 num_mismatches_ = 0; -template <> -bool NearComparator::ExpectValuesNear(half expected, half actual) { - return ExpectValuesNear(static_cast(std::move(expected)), - static_cast(std::move(actual))); -} + // Number of elements with a nan mismatch. + int64 num_nan_mismatches_ = 0; -template <> -void NearComparator::UpdateAndLogMiscompares( - const bfloat16 expected, const bfloat16 actual, const Shape& shape, - const int64 linear_index) { - UpdateAndLogMiscompares(static_cast(expected), - static_cast(actual), shape, linear_index); -} + // Number of elements which exceed the absolute/relative error bound. + int64 num_abs_mismatches_ = 0; + int64 num_rel_mismatches_ = 0; -template <> -void NearComparator::UpdateAndLogMiscompares(half expected, half actual, - const Shape& shape, - const int64 linear_index) { - UpdateAndLogMiscompares(static_cast(std::move(expected)), - static_cast(std::move(actual)), shape, - linear_index); -} - -} // namespace + // A Literal containing which elements did not match in the expected and + // actual literals. mismatches_ contains PREDs and is of the same sizes as + // the comparison literals. + Literal mismatches_; + + // The number of mismatches to report in the output, sorted by relative error + // magnitude. + static constexpr int64 kTopRelativeErrorCount = 5; + + // The set of mismatches with the largest relative error. The size of this set + // is bounded by kTopRelativeErrorCount. + std::multiset top_rel_mismatches_; + + // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the + // bounds of these buckets. abs_value_buckets_ contains a pair for each + // bucket: the element count and failure count. + static constexpr std::array kAbsValueBucketBounds = { + 0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits::infinity()}; + std::vector> abs_value_buckets_; + + // Buckets for relative and absolute errors. The relative error buckets only + // contains those elements which exceed the *absolute* error bound, and vice + // versa. This makes it easy to see the effect of adjusting the relative (or + // absolute) error bound on the success of the comparison. kErrorBucketBounds + // are the lower bounds of the buckets in both vectors. The error buckets are + // a cumulative distribution so an error value may appear in more than one + // bucket. For example an error value of 0.003 may appear in the buckets + // bounded by 0.01, 0.1, and 1.0. + static constexpr std::array kErrorBucketBounds = {0.0001, 0.001, + 0.01, 0.1, 1}; + std::vector abs_error_buckets_; + std::vector rel_error_buckets_; +}; -/* static */ ::testing::AssertionResult LiteralTestUtil::Near( - const Literal& expected, const Literal& actual, const ErrorSpec& error) { +template +constexpr std::array NearComparator::kAbsValueBucketBounds; +template +constexpr std::array NearComparator::kErrorBucketBounds; + +// Helper function for comparing two literals for nearness. Handles tuple-shapes +// via recursion. shape_index is the ShapeIndex of expected (or actual) +// currently being compared. +::testing::AssertionResult NearHelper(const Literal& expected, + const Literal& actual, + const ErrorSpec& error, + bool detailed_message, + const ShapeIndex& shape_index) { ::testing::AssertionResult err = - EqualShapes(expected.shape(), actual.shape()); + LiteralTestUtil::EqualShapes(expected.shape(), actual.shape()); if (!err) { return err; } if (ShapeUtil::IsTuple(expected.shape())) { for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - SCOPED_TRACE(tensorflow::strings::StrCat( - "Tuple index ", i, " in ", ShapeUtil::HumanString(expected.shape()))); const auto expected_element = LiteralView::Create(expected, {i}); const auto actual_element = LiteralView::Create(actual, {i}); - + ShapeIndex element_index = shape_index; + element_index.push_back(i); ::testing::AssertionResult res = - Near(expected_element, actual_element, error); - if (err && !res) { - err = res; + NearHelper(expected_element, actual_element, error, detailed_message, + element_index); + if (!res) { + string err_message = + Printf("\nArray at shape index %s%s", + element_index.ToString().c_str(), res.message()); + if (err) { + err = ::testing::AssertionFailure() << err_message; + } else { + err << err_message; + } } } + if (!err && shape_index.empty()) { + // Emit a top-level error message containing the top-level shape in case + // of mismatch. + int64 total_elements = RecursiveElementCount(actual.shape()); + err = ::testing::AssertionFailure() + << Printf("\nMismatches in shape %s (%lld elements):\n%s", + ShapeUtil::HumanString(actual.shape()).c_str(), + total_elements, err.message()); + } return err; } if (ShapeUtil::ElementIsFloating(expected.shape()) || ShapeUtil::ElementIsComplex(expected.shape())) { - NearComparator comparator(error); - return comparator.ExpectNear(expected, actual) - ? ::testing::AssertionSuccess() - : ::testing::AssertionFailure() << "values were not near"; + switch (expected.shape().element_type()) { + case BF16: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case F16: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case F32: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case F64: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + case C64: + return NearComparator::Compare(expected, actual, error, + detailed_message); + break; + default: + LOG(FATAL) << "Unsupported primitive type in near comparator: " + << PrimitiveType_Name(expected.shape().element_type()) + << ". Must be floating-point type."; + } } - return Equal(expected, actual); + // Non-floating point literal. + return LiteralTestUtil::Equal(expected, actual); +} + +} // namespace + +/* static */ ::testing::AssertionResult LiteralTestUtil::Near( + const Literal& expected, const Literal& actual, const ErrorSpec& error, + bool detailed_message) { + return NearHelper(expected, actual, error, detailed_message, + /*shape_index=*/{}); } /* static */ void LiteralTestUtil::ExpectNear(const Literal& expected, const Literal& actual, const ErrorSpec& error, const string& message) { - EXPECT_TRUE(Near(expected, actual, error)) - << (message.empty() - ? "" - : tensorflow::strings::StrCat("\nmessage: ", message)); + ::testing::AssertionResult res = + Near(expected, actual, error, /*detailed_message=*/false); + if (!res) { + res << "Expected: " << TruncateHugeLiteral(expected) << "\n"; + res << "Actual: " << TruncateHugeLiteral(actual) << "\n"; + if (!message.empty()) { + res << StrCat("\nmessage: ", message); + } + } + EXPECT_TRUE(res); } /*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual( @@ -754,8 +915,7 @@ void NearComparator::UpdateAndLogMiscompares(half expected, half actual, /* static */ string LiteralTestUtil::MultiIndexAsString( tensorflow::gtl::ArraySlice multi_index) { - return tensorflow::strings::StrCat( - "{", tensorflow::str_util::Join(multi_index, ","), "}"); + return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}"); } /* static */ std::unique_ptr LiteralTestUtil::Reshape( diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h index 7b757a4bd7e7592583b7596b4305ddb7e6c52d75..a755568c0f098e15512bd1d3720269c867bc9c49 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.h +++ b/tensorflow/compiler/xla/tests/literal_test_util.h @@ -122,16 +122,19 @@ class LiteralTestUtil { // bounds are equivalent. // // Tuples are matched recursively. When comparing tensors of - // non-floating-point type, checks for exact equality, ignoring the ErroSpec. + // non-floating-point type, checks for exact equality, ignoring the ErrorSpec. // // If the shape of the literals is neither a complex/floating-point tensor nor // a tuple which contains a complex/floating-point tensor, Near() is // equivalent to Equal(). We don't raise an error in this case, because we // want to allow callers to call Near() even if they have no preconceptions // about the shapes being compared. + // + // If detailed_message is true, then the error message in the assertion result + // will contain a more detailed breakdown of mismatches. static ::testing::AssertionResult Near( - const Literal& expected, const Literal& actual, - const ErrorSpec& error) TF_MUST_USE_RESULT; + const Literal& expected, const Literal& actual, const ErrorSpec& error, + bool detailed_message = false) TF_MUST_USE_RESULT; // Expects expected and actual to be Near with the given error. static void ExpectNear(const Literal& expected, const Literal& actual, diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index 3a421f8458268a14dcdd84889bcae4990c095ea4..9d619a77c7e8d6398b559e8f562cd7f8194e0811 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -89,7 +89,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { EXPECT_EQ("2", literal->ToString()); } else if (result.find("actual") != string::npos) { EXPECT_EQ("4", literal->ToString()); - } else if (result.find("miscompares") != string::npos) { + } else if (result.find("mismatches") != string::npos) { EXPECT_EQ("true", literal->ToString()); } else { FAIL() << "unknown file in temporary directory: " << result; diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 7e92439c494b677f718a63c71c20828d65bebef4..2f46ee0be216d7dabf1c476d3cfb7d528f8ab6a4 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -43,7 +43,7 @@ class LLVMCompilerTest : public ::testing::Test { ~LLVMCompilerTest() override {} protected: - using Platform = ::perftools::gputools::Platform; + using Platform = se::Platform; explicit LLVMCompilerTest(string platform_name) : platform_name_(std::move(platform_name)) {} @@ -95,7 +95,7 @@ class LLVMCompilerTest : public ::testing::Test { modules.push_back(hlo_module->Clone()); modules.push_back(std::move(hlo_module)); - std::vector> executors; + std::vector> executors; executors.push_back({backend_->default_stream_executor()}); executors.push_back({backend_->default_stream_executor()}); diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc index 3d30ceeaf1b0369b6fdc0cd9620c04aae287941c..f21f83992ffb7c07dff31c68a7e9e3f7944bf512 100644 --- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc @@ -15,9 +15,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -25,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/local_client_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -37,7 +37,7 @@ class LocalClientAllocationTest : public LocalClientTestBase { }; XLA_TEST_F(LocalClientAllocationTest, AddVectors) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({0.0f, 1.0f, 2.0f}); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -53,7 +53,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) { // deallocation happen on the right allocator. ExecutableRunOptions options; options.set_allocator(allocator); - std::unique_ptr result = + tensorflow::gtl::optional result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(), options); @@ -66,14 +66,14 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) { // Deallocate result and verify that deallocate was called once. int64 deallocation_count_before = allocator_->deallocation_count(); - result = nullptr; + result.reset(); EXPECT_EQ(deallocation_count_before + 1, allocator_->deallocation_count()); } XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) { // Run a computation on every device on the system. Verify that allocation // occurs on the proper device. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({0.0f, 1.0f, 2.0f}); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -92,7 +92,7 @@ XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) { computation, {}, ExecutableBuildOptions().set_device_ordinal(d), ExecutableRunOptions().set_device_ordinal(d).set_allocator(allocator)); LiteralTestUtil::ExpectR1Near( - {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_); + {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_); // At least one allocation should have been performed when executing the // computation. diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 2462ea39f914b1dbb525ea777a48d9ce66035638..44c6811df84f49b6c1b24c11449939e2d375a9d1 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -18,9 +18,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -43,8 +42,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -56,61 +53,57 @@ class LocalClientExecuteTest : public LocalClientTestBase { }; XLA_TEST_F(LocalClientExecuteTest, Constant) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto y = builder.ConstantR0(123.0f); - std::unique_ptr result = + ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); - - LiteralTestUtil::ExpectR0Near(123.f, *ShapedBufferToLiteral(*result), + LiteralTestUtil::ExpectR0Near(123.f, *ShapedBufferToLiteral(result), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, AddScalars) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto y = builder.ConstantR0(123.0f); builder.Add(x, y); auto x_value = LiteralToShapedBuffer(*Literal::CreateR0(42.0f)); - std::unique_ptr result = - ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_value.get()}); - - LiteralTestUtil::ExpectR0Near(165.f, *ShapedBufferToLiteral(*result), + ScopedShapedBuffer result = + ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_value}); + LiteralTestUtil::ExpectR0Near(165.f, *ShapedBufferToLiteral(result), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0}), "x"); auto y = builder.ConstantR1({}); builder.Add(x, y); auto x_array = LiteralToShapedBuffer(*Literal::CreateR1({})); - std::unique_ptr result = - ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()}); - - LiteralTestUtil::ExpectR1Near({}, *ShapedBufferToLiteral(*result), + ScopedShapedBuffer result = + ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array}); + LiteralTestUtil::ExpectR1Near({}, *ShapedBufferToLiteral(result), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, AddVectors) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); auto x_array = LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); - std::unique_ptr result = - ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {x_array.get()}); - + ScopedShapedBuffer result = + ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&x_array}); LiteralTestUtil::ExpectR1Near( - {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_); + {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -118,18 +111,17 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) { auto x_array = LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); ExecutionProfile profile; - std::unique_ptr result = ExecuteLocallyOrDie( - builder.Build().ValueOrDie(), {x_array.get()}, - DefaultExecutableBuildOptions(), + ScopedShapedBuffer result = ExecuteLocallyOrDie( + builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions().set_execution_profile(&profile)); LiteralTestUtil::ExpectR1Near( - {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_); + {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_); EXPECT_GT(profile.compute_and_transfer_time_ns(), 0); } XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Add(x, y); @@ -138,31 +130,31 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) { // Create x as a col-major array. auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1}))); - EXPECT_TRUE(LayoutUtil::Equal(x_array->on_device_shape().layout(), + EXPECT_TRUE(LayoutUtil::Equal(x_array.on_device_shape().layout(), LayoutUtil::MakeLayout({0, 1}))); // Create y as a row-major array. auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout( {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0}))); - EXPECT_TRUE(LayoutUtil::Equal(y_array->on_device_shape().layout(), + EXPECT_TRUE(LayoutUtil::Equal(y_array.on_device_shape().layout(), LayoutUtil::MakeLayout({1, 0}))); - std::unique_ptr result_colmaj = - ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()}); + ScopedShapedBuffer result_colmaj = + ExecuteLocallyOrDie(computation, {&x_array, &y_array}); LiteralTestUtil::ExpectR2Near({{11.0f, 22.0f}, {33.0f, 44.0f}}, - *ShapedBufferToLiteral(*result_colmaj), + *ShapedBufferToLiteral(result_colmaj), error_spec_); // Run with the parameter values in a different order. - std::unique_ptr result_param_swap = - ExecuteLocallyOrDie(computation, {y_array.get(), x_array.get()}); + ScopedShapedBuffer result_param_swap = + ExecuteLocallyOrDie(computation, {&y_array, &x_array}); LiteralTestUtil::ExpectR2Near( {{11.0f, 22.0f}, {33.0f, 44.0f}}, - *ShapedBufferToLiteral(*result_param_swap), error_spec_); + *ShapedBufferToLiteral(result_param_swap), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Add(x, y); @@ -174,32 +166,32 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) { *Literal::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); // Run with col-major result layout. - std::unique_ptr result_colmaj = ExecuteLocallyOrDie( - computation, {x_array.get(), y_array.get()}, + ScopedShapedBuffer result_colmaj = ExecuteLocallyOrDie( + computation, {&x_array, &y_array}, DefaultExecutableBuildOptions().set_result_layout( ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {0, 1})), DefaultExecutableRunOptions()); - EXPECT_TRUE(LayoutUtil::Equal(result_colmaj->on_device_shape().layout(), + EXPECT_TRUE(LayoutUtil::Equal(result_colmaj.on_device_shape().layout(), LayoutUtil::MakeLayout({0, 1}))); LiteralTestUtil::ExpectR2Near({{11.0f, 22.0f}, {33.0f, 44.0f}}, - *ShapedBufferToLiteral(*result_colmaj), + *ShapedBufferToLiteral(result_colmaj), error_spec_); // Run with row-major result layout. - std::unique_ptr result_rowmaj = ExecuteLocallyOrDie( - computation, {x_array.get(), y_array.get()}, + ScopedShapedBuffer result_rowmaj = ExecuteLocallyOrDie( + computation, {&x_array, &y_array}, DefaultExecutableBuildOptions().set_result_layout( ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, {1, 0})), DefaultExecutableRunOptions()); - EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj->on_device_shape().layout(), + EXPECT_TRUE(LayoutUtil::Equal(result_rowmaj.on_device_shape().layout(), LayoutUtil::MakeLayout({1, 0}))); LiteralTestUtil::ExpectR2Near({{11.0f, 22.0f}, {33.0f, 44.0f}}, - *ShapedBufferToLiteral(*result_rowmaj), + *ShapedBufferToLiteral(result_rowmaj), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, TupleResult) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Tuple({x, y, x}); @@ -210,13 +202,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) { auto y_array = LiteralToShapedBuffer( *Literal::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()}); + ScopedShapedBuffer result = + ExecuteLocallyOrDie(computation, {&x_array, &y_array}); - EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape())); - EXPECT_EQ(3, ShapeUtil::TupleElementCount(result->on_host_shape())); + EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape())); + EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape())); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0})); LiteralTestUtil::ExpectR2Equal( @@ -227,7 +219,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) { } XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); auto inner_tuple = builder.Tuple({x, y, x}); @@ -239,13 +231,13 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { auto y_array = LiteralToShapedBuffer( *Literal::CreateR2({{10.0f, 20.0f}, {30.0f, 40.0f}})); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {x_array.get(), y_array.get()}); + ScopedShapedBuffer result = + ExecuteLocallyOrDie(computation, {&x_array, &y_array}); - EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape())); - EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape())); + EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape())); + EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape())); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1})); LiteralTestUtil::ExpectR2Equal( @@ -261,7 +253,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { // Verify setting the result layout of a computation with a tuple output. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Tuple({x, y}); @@ -276,11 +268,11 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{2, 2}, /*minor_to_major=*/{1, 0})}); options.set_result_layout(shape_with_layout); - std::unique_ptr result = ExecuteLocallyOrDie( - builder.Build().ValueOrDie(), {array.get(), array.get()}, options, - DefaultExecutableRunOptions()); + ScopedShapedBuffer result = + ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {&array, &array}, + options, DefaultExecutableRunOptions()); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0})); LiteralTestUtil::ExpectR2Equal( @@ -298,7 +290,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { // Computation adds the respective array and vector elements from each tuple // argument and returns the results as a tuple. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, tuple_shape0, "x"); auto y = builder.Parameter(1, tuple_shape1, "y"); auto x_0 = builder.GetTupleElement(x, 0); @@ -320,13 +312,13 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { auto x_buffer = LiteralToShapedBuffer(*x_literal); auto y_buffer = LiteralToShapedBuffer(*y_literal); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {x_buffer.get(), y_buffer.get()}); + ScopedShapedBuffer result = + ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer}); - EXPECT_TRUE(ShapeUtil::IsTuple(result->on_host_shape())); - EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->on_host_shape())); + EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape())); + EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape())); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( {{56.0f, 46.0f}, {36.0f, 26.0f}}, LiteralView::Create(*result_literal, {0})); @@ -345,7 +337,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { // Computation negates the array element and sums the two vector elements in // the nested tuple. The resulting array and vector are returned as a tuple. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, nested_tuple_shape, "param"); auto inner_tuple = builder.GetTupleElement(param, 0); auto inner_array = builder.GetTupleElement(inner_tuple, 0); @@ -365,10 +357,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { Literal::CreateR1({222.0, -2.0, 10.0}).get()}); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {arg_buffer.get()}); + ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer}); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0})); LiteralTestUtil::ExpectR1Equal( @@ -384,7 +375,7 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { const Shape tuple_shape = ShapeUtil::MakeTupleShape({array_shape, array_shape}); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, tuple_shape, "param"); auto element_0 = builder.GetTupleElement(param, 0); auto element_1 = builder.GetTupleElement(param, 1); @@ -396,18 +387,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { Literal::CreateR2({{11.0, 3.0}, {4.0, 5.0}}).get()}); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); - std::unique_ptr result_0 = - ExecuteLocallyOrDie(computation, {arg_buffer.get()}); - std::unique_ptr result_0_literal = ShapedBufferToLiteral(*result_0); + ScopedShapedBuffer result_0 = ExecuteLocallyOrDie(computation, {&arg_buffer}); + std::unique_ptr result_0_literal = ShapedBufferToLiteral(result_0); LiteralTestUtil::ExpectR2Equal( {{-1.0, -2.0}, {-3.0, -4.0}}, LiteralView::Create(*result_0_literal, {0})); LiteralTestUtil::ExpectR2Equal( {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1})); - std::unique_ptr result_1 = - ExecuteLocallyOrDie(computation, {result_0.get()}); - std::unique_ptr result_1_literal = ShapedBufferToLiteral(*result_1); + ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0}); + std::unique_ptr result_1_literal = ShapedBufferToLiteral(result_1); LiteralTestUtil::ExpectR2Equal( {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0})); LiteralTestUtil::ExpectR2Equal( @@ -430,11 +419,11 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { std::vector element_shapes(kElementCount, element_shape); const Shape tuple_shape = ShapeUtil::MakeTupleShape(element_shapes); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, tuple_shape, "param"); // Add each element's tuple index value to every element. - std::vector result_elements; + std::vector result_elements; for (int i = 0; i < kElementCount; ++i) { auto element = builder.GetTupleElement(param, i); result_elements.push_back( @@ -453,10 +442,8 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { Literal::MakeTupleOwned(std::move(arg_elements)); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {arg_buffer.get()}); - - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer}); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); for (int i = 0; i < kElementCount; ++i) { LiteralTestUtil::ExpectR1Near( @@ -465,9 +452,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { } } -// TODO(b/66968986): Test times out on CPU parallel backend. Disabled -// 2017-09-26. -XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) { +XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) { // Construct and run a computation which takes a two-level nested tuple // parameter with a large fanout. const int kFanout = 40; @@ -479,15 +464,15 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) { std::vector inner_tuple_shapes(kFanout, inner_tuple_shape); const Shape tuple_shape = ShapeUtil::MakeTupleShape(inner_tuple_shapes); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, tuple_shape, "param"); // The computation increments each leaf value by an amount equal to the leaf's // ordinal position in a traversal of the tuple. - std::vector result_elements; + std::vector result_elements; for (int i = 0; i < kFanout; ++i) { auto outer_element = builder.GetTupleElement(param, i); - std::vector inner_result_elements; + std::vector inner_result_elements; for (int j = 0; j < kFanout; ++j) { auto inner_element = builder.GetTupleElement(outer_element, j); inner_result_elements.push_back(builder.Add( @@ -511,9 +496,8 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) { auto arg_literal = Literal::MakeTupleOwned(std::move(outer_tuple_elements)); auto arg_buffer = LiteralToShapedBuffer(*arg_literal); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {arg_buffer.get()}); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer}); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); for (int i = 0; i < kFanout; ++i) { for (int j = 0; j < kFanout; ++j) { @@ -535,7 +519,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { shape = ShapeUtil::MakeTupleShape({shape}); } - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto element = builder.Parameter(0, shape, "param"); for (int i = 0; i < kTupleDepth; ++i) { element = builder.GetTupleElement(element, 0); @@ -556,9 +540,8 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { } auto arg_buffer = LiteralToShapedBuffer(*arg_literal); - std::unique_ptr result = - ExecuteLocallyOrDie(computation, {arg_buffer.get()}); - std::unique_ptr result_literal = ShapedBufferToLiteral(*result); + ScopedShapedBuffer result = ExecuteLocallyOrDie(computation, {&arg_buffer}); + std::unique_ptr result_literal = ShapedBufferToLiteral(result); ShapeIndex index; for (int i = 0; i < kTupleDepth; ++i) { @@ -570,7 +553,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { // Test passing in an invalid number of arguments. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {3}), "y"); builder.Add(x, y); @@ -578,7 +561,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { auto x_array = LiteralToShapedBuffer(*Literal::CreateR1({1.0f, 2.0f, 3.0f})); auto execute_status = - ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()}); + ExecuteLocally(builder.Build().ValueOrDie(), {&x_array}); EXPECT_FALSE(execute_status.ok()); EXPECT_THAT(execute_status.status().error_message(), @@ -587,14 +570,14 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) { // Test passing in an argument with the wrong shape. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); builder.Neg(x); auto x_array = LiteralToShapedBuffer( *Literal::CreateR2({{0.0f, 1.0f}, {2.0f, 3.0f}})); auto execute_status = - ExecuteLocally(builder.Build().ValueOrDie(), {x_array.get()}); + ExecuteLocally(builder.Build().ValueOrDie(), {&x_array}); EXPECT_FALSE(execute_status.ok()); EXPECT_THAT(execute_status.status().error_message(), @@ -604,14 +587,14 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) { XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) { // Test passing in an invalid result layout parameter. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); builder.Neg(x); auto x_array = LiteralToShapedBuffer( *Literal::CreateR2({{0.0f, 1.0f}, {2.0f, 3.0f}})); auto execute_status = ExecuteLocally( - builder.Build().ValueOrDie(), {x_array.get()}, + builder.Build().ValueOrDie(), {&x_array}, DefaultExecutableBuildOptions().set_result_layout( ShapeUtil::MakeShapeWithLayout(F32, /*dimensions=*/{1, 2, 3, 4}, @@ -627,7 +610,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) { XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) { // Try to run a trivial computation on every device on the system. If a // specific device is not supported, check that the right error is returned. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto computation = builder.Build().ConsumeValueOrDie(); for (int d = 0; d < local_client_->device_count(); ++d) { @@ -644,9 +627,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) { computation, {}, DefaultExecutableBuildOptions().set_device_ordinal(d), DefaultExecutableRunOptions().set_device_ordinal(d)); - EXPECT_EQ(d, result->device_ordinal()); + EXPECT_EQ(d, result.device_ordinal()); LiteralTestUtil::ExpectR0Equal(42.0f, - *ShapedBufferToLiteral(*result)); + *ShapedBufferToLiteral(result)); } } } @@ -654,7 +637,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) { XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) { // Try running computations on devices with device ordinal values which do not // exist. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto computation = builder.Build().ConsumeValueOrDie(); @@ -671,7 +654,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) { XLA_TEST_F(LocalClientExecuteTest, RunOnStream) { // Run a computation on a specific stream on each device on the system. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto computation = builder.Build().ConsumeValueOrDie(); @@ -689,9 +672,9 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnStream) { DefaultExecutableRunOptions().set_stream(&stream)); // As a check to verify that the computation ran of the device associated // with the stream. This is a weak check, but stronger verification is hard. - EXPECT_EQ(d, result->device_ordinal()); + EXPECT_EQ(d, result.device_ordinal()); LiteralTestUtil::ExpectR0Equal(42.0f, - *ShapedBufferToLiteral(*result)); + *ShapedBufferToLiteral(result)); } } @@ -707,7 +690,7 @@ XLA_TEST_F(LocalClientExecuteTest, se::Stream wrong_stream(wrong_platform->ExecutorForDevice(0).ValueOrDie()); wrong_stream.Init(); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto execute_status = ExecuteLocally( builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(), @@ -724,7 +707,7 @@ XLA_TEST_F(LocalClientExecuteTest, .ValueOrDie(); TestAllocator allocator(wrong_platform); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto y = builder.ConstantR0(123.0f); auto execute_status = ExecuteLocally( @@ -737,7 +720,7 @@ XLA_TEST_F(LocalClientExecuteTest, XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) { // Try to run a computation on a stream that has not been initialized. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); LOG(INFO) << "default device = " << local_client_->default_device_ordinal(); @@ -757,7 +740,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) { } XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; @@ -767,9 +750,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { {builder.ConstantR1(vec2), builder.ConstantR1(vec1)}); builder.Select(builder.ConstantR0(false), tuple12, tuple21); - std::unique_ptr result = + ScopedShapedBuffer result = ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); - std::unique_ptr tuple_literal = ShapedBufferToLiteral(*result); + std::unique_ptr tuple_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR1Equal( {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0})); LiteralTestUtil::ExpectR1Equal( @@ -777,7 +760,7 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { } XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -793,12 +776,12 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) { auto x_array = LiteralToShapedBuffer(*Literal::CreateR1({0.0f, 1.0f, 2.0f})); - std::unique_ptr result = - executable->Run({x_array.get()}, DefaultExecutableRunOptions()) + ScopedShapedBuffer result = + executable->Run({&x_array}, DefaultExecutableRunOptions()) .ConsumeValueOrDie(); LiteralTestUtil::ExpectR1Near( - {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(*result), error_spec_); + {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_); } XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) { @@ -811,7 +794,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion) { literal, local_client_->default_device_ordinal(), allocator_)); TF_ASSERT_OK_AND_ASSIGN( auto transferred_literal, - local_client_->ShapedBufferToLiteral(*shaped_buffer)); + local_client_->ShapedBufferToLiteral(shaped_buffer)); EXPECT_EQ(literal, *transferred_literal); }; @@ -851,7 +834,7 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) { literal, local_client_->default_device_ordinal(), allocator_)); TF_ASSERT_OK_AND_ASSIGN( auto transferred_literal, - local_client_->ShapedBufferToLiteral(*shaped_buffer)); + local_client_->ShapedBufferToLiteral(shaped_buffer)); EXPECT_EQ(literal, *transferred_literal); }; @@ -867,9 +850,8 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) { // TODO(b/34359662): Support infeed/outfeed on GPU and CPU parallel. // 2017-10-18. -XLA_TEST_F(LocalClientExecuteTest, - DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(InfeedOutfeedTest))) { - ComputationBuilder builder(local_client_, TestName()); +XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) { + XlaBuilder builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {3}); auto in = builder.Infeed(shape); auto constant = builder.ConstantR1({1.0f, 2.0f, 3.0f}); @@ -907,7 +889,7 @@ void BM_LocalClientOverhead(int num_iters) { int device_ordinal = client->default_device_ordinal(); // Use a tiny add operation as the computation. - ComputationBuilder builder(client, "Add"); + XlaBuilder builder("Add"); auto shape = ShapeUtil::MakeShape(F32, {2, 3}); auto x = builder.Parameter(0, shape, "x"); builder.Add(x, x); @@ -919,12 +901,12 @@ void BM_LocalClientOverhead(int num_iters) { .ConsumeValueOrDie(); auto literal = Literal::CreateR2({{0, 0, 0}, {0, 0, 0}}); ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice( - executors[device_ordinal], *literal, *buffer)); + executors[device_ordinal], *literal, buffer)); const int kWarmups = 2; auto executable_status = client->Compile( - computation, {&buffer->on_host_shape()}, ExecutableBuildOptions()); + computation, {&buffer.on_host_shape()}, ExecutableBuildOptions()); ASSERT_IS_OK(executable_status); std::unique_ptr executable = executable_status.ConsumeValueOrDie(); @@ -936,13 +918,13 @@ void BM_LocalClientOverhead(int num_iters) { run_options.set_allocator(&allocator).set_stream(&stream); for (int i = 0; i < kWarmups; ++i) { - auto result = executable->Run({buffer.get()}, run_options); + auto result = executable->Run({&buffer}, run_options); ASSERT_IS_OK(result); } tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = executable->Run({buffer.get()}, run_options); + auto result = executable->Run({&buffer}, run_options); ASSERT_IS_OK(result); } } diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 96b976d25d75d35f46adfd104a03aceb363661eb..ca8e4cdbdb6a8fdb2c70fbd39f77ffc18c995aa6 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -35,19 +35,21 @@ namespace xla { /* static */ TestAllocator* LocalClientTestBase::allocator_; -StatusOr TestAllocator::Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) { +StatusOr TestAllocator::Allocate(int device_ordinal, + uint64 size, + bool retry_on_failure) { VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")"; { tensorflow::mutex_lock lock(count_mutex_); allocation_count_++; device_allocation_count_[device_ordinal]++; } - return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size); + return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size, + retry_on_failure); } -tensorflow::Status TestAllocator::Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) { +tensorflow::Status TestAllocator::Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) { VLOG(2) << "Deallocate(" << device_ordinal << ")"; { tensorflow::mutex_lock lock(count_mutex_); @@ -88,7 +90,7 @@ int64 TestAllocator::deallocation_count(int device_ordinal) const { } /* static */ TestAllocator* LocalClientTestBase::GetOrCreateAllocator( - perftools::gputools::Platform* platform) { + se::Platform* platform) { static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED); tensorflow::mutex_lock lock(mu); @@ -115,8 +117,7 @@ struct LocalClientTestBase::EigenThreadPoolWrapper { std::unique_ptr device; }; -LocalClientTestBase::LocalClientTestBase( - perftools::gputools::Platform* platform) +LocalClientTestBase::LocalClientTestBase(se::Platform* platform) : local_client_( ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie()), thread_pool_wrapper_(new EigenThreadPoolWrapper()) { @@ -128,7 +129,7 @@ LocalClientTestBase::LocalClientTestBase( LocalClientTestBase::~LocalClientTestBase() {} -std::unique_ptr LocalClientTestBase::LiteralToShapedBuffer( +ScopedShapedBuffer LocalClientTestBase::LiteralToShapedBuffer( const Literal& literal) { return local_client_ ->LiteralToShapedBuffer(literal, local_client_->default_device_ordinal()) @@ -155,16 +156,16 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const { return run_options; } -std::unique_ptr LocalClientTestBase::ExecuteLocallyOrDie( - const Computation& computation, +ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments) { return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions()) .ConsumeValueOrDie(); } -std::unique_ptr LocalClientTestBase::ExecuteLocallyOrDie( - const Computation& computation, +ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options) { @@ -172,17 +173,15 @@ std::unique_ptr LocalClientTestBase::ExecuteLocallyOrDie( .ConsumeValueOrDie(); } -StatusOr> -LocalClientTestBase::ExecuteLocally( - const Computation& computation, +StatusOr LocalClientTestBase::ExecuteLocally( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments) { return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions()); } -StatusOr> -LocalClientTestBase::ExecuteLocally( - const Computation& computation, +StatusOr LocalClientTestBase::ExecuteLocally( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options) { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index f0c73f04f6eb67b2e9cb5e111eccdc3818059b2b..3bbb760c806412a671bc2502846e123e2582fd16 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -21,8 +21,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -41,15 +41,15 @@ namespace xla { class TestAllocator : public StreamExecutorMemoryAllocator { public: - explicit TestAllocator(perftools::gputools::Platform* platform) + explicit TestAllocator(se::Platform* platform) : StreamExecutorMemoryAllocator( platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) { } - StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) override; - tensorflow::Status Deallocate( - int device_ordinal, perftools::gputools::DeviceMemoryBase* mem) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + tensorflow::Status Deallocate(int device_ordinal, + se::DeviceMemoryBase* mem) override; // Return the number of allocations that have been performed. int64 allocation_count() const; @@ -75,18 +75,15 @@ class TestAllocator : public StreamExecutorMemoryAllocator { class LocalClientTestBase : public ::testing::Test { protected: struct EigenThreadPoolWrapper; - explicit LocalClientTestBase( - perftools::gputools::Platform* platform = nullptr); + explicit LocalClientTestBase(se::Platform* platform = nullptr); virtual ~LocalClientTestBase(); - static TestAllocator* GetOrCreateAllocator( - perftools::gputools::Platform* platform); + static TestAllocator* GetOrCreateAllocator(se::Platform* platform); // Copy the given literal onto the default device and return a // ScopedShapedBuffer. Convenience wrapper around // LocalClient::LiteralToShapedBuffer. - std::unique_ptr LiteralToShapedBuffer( - const Literal& literal); + ScopedShapedBuffer LiteralToShapedBuffer(const Literal& literal); // Construct and return a literal containing the array represented by // shaped_buffer. @@ -95,20 +92,20 @@ class LocalClientTestBase : public ::testing::Test { // Execute the given computation on the local client. With and without // options. - StatusOr> ExecuteLocally( - const Computation& computation, + StatusOr ExecuteLocally( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments); - StatusOr> ExecuteLocally( - const Computation& computation, + StatusOr ExecuteLocally( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options); - std::unique_ptr ExecuteLocallyOrDie( - const Computation& computation, + ScopedShapedBuffer ExecuteLocallyOrDie( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments); - std::unique_ptr ExecuteLocallyOrDie( - const Computation& computation, + ScopedShapedBuffer ExecuteLocallyOrDie( + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options); @@ -128,7 +125,7 @@ class LocalClientTestBase : public ::testing::Test { // of the process. So make the allocator static. static TestAllocator* allocator_; - perftools::gputools::StreamExecutor* stream_executor_; + se::StreamExecutor* stream_executor_; TransferManager* transfer_manager_; LocalClient* local_client_; diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc index efe6cc67872713a8aeecc11aeafe4902676817a6..7df45bebebdd3eb2e71f27d831a8e2ac9e3b5f7c 100644 --- a/tensorflow/compiler/xla/tests/map_test.cc +++ b/tensorflow/compiler/xla/tests/map_test.cc @@ -16,8 +16,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -41,7 +39,7 @@ namespace { class MapTest : public ClientLibraryTestBase { public: - explicit MapTest(perftools::gputools::Platform* platform = nullptr) + explicit MapTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); @@ -341,48 +339,6 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) { ComputeAndCompareR0(&builder, 73.0, {}, ErrorSpec(0.01f)); } -TEST_F(MapTest, VersionedEmbeddedComputation) { - // Build a computation X, use it in a map, then add an additional operation to - // computation X and use it again in a different map. Verify that the proper - // versions of computation X are used in each of the maps. - - // Create a (embedded) computation which adds one to its parameter argument. - ComputationBuilder embedded_builder(client_, "EmbeddedComputation"); - auto param_0 = - embedded_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0"); - auto constant_one = embedded_builder.ConstantR0(1.0); - auto adder_to_one = embedded_builder.Add(param_0, constant_one); - auto computation_status = embedded_builder.Build(); - ASSERT_IS_OK(computation_status.status()); - auto embedded_computation = computation_status.ConsumeValueOrDie(); - - ComputationBuilder builder(client_, TestName()); - auto constant_vector = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto map_plus_1 = builder.Map({constant_vector}, embedded_computation, {0}); - - // Add another Add(1) operation to the existing embedded computation. This - // requires using the stub interface because the ComputationBuilder does not - // allow modification to the XlaComputation objects after they have been - // built. - BinaryOpRequest request; - request.set_binop(BINOP_ADD); - *request.mutable_lhs() = adder_to_one; - *request.mutable_rhs() = constant_one; - OpRequest op_request; - *op_request.mutable_computation() = embedded_computation.handle(); - *op_request.mutable_binary_op_request() = request; - OpResponse response; - tensorflow::Status s = client_->stub()->Op(&op_request, &response); - ASSERT_TRUE(s.ok()); - - auto map_plus_2 = builder.Map({map_plus_1}, embedded_computation, {0}); - - // The original vector has Add(1) applied to it with a map, followed by - // Add(1+1) resulting in a net Add(3). - ComputeAndCompareR1(&builder, {4.0, 5.0, 6.0, 7.0}, {}, - ErrorSpec(0.01f)); -} - TEST_F(MapTest, MapBinaryAdder) { // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors. XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc index c42f71388baba73e08a361d817e41b03e03bf133..7fa61eb33c2930ac8192ac965a71122001f808d3 100644 --- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc @@ -19,8 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -60,7 +61,7 @@ TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, TypesF16F32); XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { using T = TypeParam; - ComputationBuilder builder(this->client_, "exp_2x2"); + XlaBuilder builder("exp_2x2"); auto data = builder.ConstantR2FromArray2D({ {1.0f, 0.0f}, // row 0 {-1.0f, 0.5f}, // row 1 @@ -77,10 +78,10 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { using T = TypeParam; - Computation add_half; + XlaComputation add_half; { // add_half(x) = x + 0.5 - ComputationBuilder builder(this->client_, "add_half"); + XlaBuilder builder("add_half"); auto x_value = builder.Parameter(0, ShapeUtil::MakeShapeWithType({}), "x_value"); auto half = builder.ConstantR0(static_cast(0.5)); @@ -90,7 +91,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { add_half = computation_status.ConsumeValueOrDie(); } - ComputationBuilder builder(this->client_, "map_2x2"); + XlaBuilder builder("map_2x2"); auto data = builder.ConstantR2FromArray2D({ {1.0f, 0.0f}, // row 0 {-1.0f, 0.5f}, // row 1 @@ -106,7 +107,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) { using T = TypeParam; - ComputationBuilder builder(this->client_, "max_2x2"); + XlaBuilder builder("max_2x2"); auto lhs = builder.ConstantR2FromArray2D({ {7.0f, 2.0f}, // row 0 {3.0f, -4.0f}, // row 1 @@ -143,8 +144,7 @@ class TestLinspaceMaxParametric MakeLinspaceArray2D(from, to, rows, cols); auto arhs = MakeUnique>(rows, cols, static_cast(1.0f)); - ComputationBuilder builder( - client_, + XlaBuilder builder( tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols)); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); @@ -219,7 +219,7 @@ class MatOpsDotAddTest client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout( rhs, LayoutUtil::MakeLayout(minor_to_major(row_major))))); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs"); auto lhs_mat_arg = lhs_arg; if (transpose) { diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc index 11c0bf7a5a5bde9edcfb7f76a5c10ac4dd77bcee..0791a71aacf7614286fe964623a3172a174d4722 100644 --- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc +++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc @@ -19,8 +19,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -32,7 +32,7 @@ namespace { class SliceTest : public ClientLibraryTestBase {}; XLA_TEST_F(SliceTest, Slice2D) { - ComputationBuilder builder(client_, "slice_2d"); + XlaBuilder builder("slice_2d"); auto original = builder.ConstantR2( {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}); builder.Slice(original, {2, 1}, {4, 3}, {1, 1}); @@ -42,7 +42,7 @@ XLA_TEST_F(SliceTest, Slice2D) { } XLA_TEST_F(SliceTest, Slice3D) { - ComputationBuilder builder(client_, "slice_3d"); + XlaBuilder builder("slice_3d"); Array3D array_3d( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}); auto original = builder.ConstantR3FromArray3D(array_3d); diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc index bb7e800df84121f2045141bc366c34b94ba694ea..97dab860c06bddb2a0ffd45e48c4912c5f55d574 100644 --- a/tensorflow/compiler/xla/tests/params_test.cc +++ b/tensorflow/compiler/xla/tests/params_test.cc @@ -20,9 +20,10 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -41,7 +42,7 @@ namespace { class ParamsTest : public ClientLibraryTestBase {}; XLA_TEST_F(ParamsTest, ConstantR0F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR0(3.14159f); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); @@ -53,7 +54,7 @@ XLA_TEST_F(ParamsTest, ConstantR0F32Param) { } XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); @@ -65,7 +66,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) { } XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({3.14f, -100.25f}); std::unique_ptr param0_data = @@ -78,7 +79,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) { } XLA_TEST_F(ParamsTest, ConstantR1U8Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); string str("hello world"); std::unique_ptr param0_literal = Literal::CreateR1U8(str); std::unique_ptr param0_data = @@ -91,7 +92,7 @@ XLA_TEST_F(ParamsTest, ConstantR1U8Param) { } XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR2FromArray2D(Array2D(3, 0)); std::unique_ptr param0_data = @@ -104,7 +105,7 @@ XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) { } XLA_TEST_F(ParamsTest, ConstantR2F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR2( {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}}); std::unique_ptr param0_data = @@ -119,7 +120,7 @@ XLA_TEST_F(ParamsTest, ConstantR2F32Param) { } XLA_TEST_F(ParamsTest, TwoParameters) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr literal0 = Literal::CreateR1({1, 2}); std::unique_ptr param0_data = @@ -156,19 +157,15 @@ XLA_TEST_F(ParamsTest, MissingParameter) { std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2"); - auto computation = builder.Build().ConsumeValueOrDie(); + auto computation_status = builder.Build(); - auto execute_status = client_->Execute(computation, {data.get(), data.get()}, - /*execution_options=*/nullptr, - /*execution_profile=*/nullptr); - ASSERT_EQ(execute_status.status().code(), - tensorflow::error::FAILED_PRECONDITION); + ASSERT_NE(computation_status.status(), tensorflow::Status::OK()); } XLA_TEST_F(ParamsTest, UnusedParameter) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr literal0 = Literal::CreateR1({1, 2}); std::unique_ptr param0_data = @@ -188,7 +185,7 @@ XLA_TEST_F(ParamsTest, UnusedParameter) { XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) { // Build a computation with a couple unused parameters which are used in an // unused expression. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr literal0 = Literal::CreateR1({1, 2}); std::unique_ptr param0_data = @@ -214,12 +211,12 @@ XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) { } XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); constexpr int size = 8 * 128 * 2; std::vector init_value = {{0, 1}}; init_value.resize(size); - ComputationDataHandle sum_handle = builder.ConstantR1(init_value); + XlaOp sum_handle = builder.ConstantR1(init_value); std::vector sum = {{0, 1}}; sum.resize(size); @@ -237,8 +234,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { std::unique_ptr literal = Literal::CreateR1(sum_value); param_data_owner.push_back( client_->TransferToServer(*literal).ConsumeValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); sum_handle = builder.Add(sum_handle, param); } @@ -262,10 +258,10 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { // compilation. XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(ThreeThousandParameters))) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector> param_data_owner; - ComputationDataHandle sum_handle = builder.ConstantR0(0.0f); + XlaOp sum_handle = builder.ConstantR0(0.0f); float target = 0.0; constexpr int kParamCount = 3000; for (int i = 0; i < kParamCount; ++i) { @@ -273,8 +269,7 @@ XLA_TEST_F(ParamsTest, std::unique_ptr literal = Literal::CreateR0(i); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); sum_handle = builder.Add(sum_handle, param); } @@ -294,25 +289,24 @@ XLA_TEST_F(ParamsTest, // compilation. XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU( ThreeThousandParametersAndOutputElements))) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector> param_data_owner; - ComputationDataHandle sum_handle = builder.ConstantR1({0, 0}); + XlaOp sum_handle = builder.ConstantR1({0, 0}); int32 target = 0; constexpr int kParamCount = 3000; - std::vector params; + std::vector params; for (int i = 0; i < kParamCount; ++i) { target += i; std::unique_ptr literal = Literal::CreateR1({i, i}); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); params.push_back(param); sum_handle = builder.Add(sum_handle, param); } - std::vector outputs; + std::vector outputs; for (int i = 0; i < kParamCount; ++i) { outputs.push_back(builder.Add(params[i], sum_handle)); } @@ -353,18 +347,17 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU( // 2017-12-12. XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector> param_data_owner; constexpr int kParamCount = 1900; - std::vector params; + std::vector params; std::vector parameter_shapes; for (int i = 0; i < kParamCount; ++i) { std::unique_ptr literal = Literal::CreateR1({i, i}); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); params.push_back(param); parameter_shapes.push_back(literal->shape()); } @@ -374,7 +367,7 @@ XLA_TEST_F(ParamsTest, std::unique_ptr bool_literal = Literal::CreateR0(false); param_data_owner.push_back( std::move(client_->TransferToServer(*bool_literal)).ValueOrDie()); - ComputationDataHandle bool_param = + XlaOp bool_param = builder.Parameter(kParamCount, bool_literal->shape(), "bool_param"); params.push_back(bool_param); parameter_shapes.push_back(bool_literal->shape()); @@ -383,9 +376,9 @@ XLA_TEST_F(ParamsTest, // Create a computation for the condition: while(bool_param). Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes); - Computation condition; + XlaComputation condition; { - ComputationBuilder builder(client_, "condition"); + XlaBuilder builder("condition"); auto condition_parameter = builder.Parameter(0, while_shape, "condition_parameter"); builder.GetTupleElement(condition_parameter, kParamCount); @@ -394,11 +387,11 @@ XLA_TEST_F(ParamsTest, // Create a computation for the body. // Add {1, 1} to the each tuple element. - Computation body; + XlaComputation body; { - ComputationBuilder builder(client_, "body"); + XlaBuilder builder("body"); auto body_parameter = builder.Parameter(0, while_shape, "body_parameter"); - std::vector updates; + std::vector updates; for (int i = 0; i < kParamCount; ++i) { auto add = builder.Add(builder.GetTupleElement(body_parameter, i), builder.ConstantR1({1, 1})); @@ -413,7 +406,7 @@ XLA_TEST_F(ParamsTest, auto loop = builder.While(condition, body, init); - std::vector outputs; + std::vector outputs; for (int i = 0; i < kParamCount; ++i) { outputs.push_back(builder.GetTupleElement(loop, i)); } @@ -437,7 +430,7 @@ XLA_TEST_F(ParamsTest, #endif XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Shape r1f32_3 = ShapeUtil::MakeShape(F32, {3}); Shape tuple_shape = ShapeUtil::MakeTupleShape({r1f32_3, r1f32_3}); @@ -464,7 +457,7 @@ XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) { XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) { std::unique_ptr literal = Literal::CreateR2WithLayout( {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1})); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Parameter(0, literal->shape(), "input"); std::unique_ptr data = @@ -476,7 +469,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) { XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) { std::unique_ptr literal = Literal::CreateR2WithLayout( {{1, 3}, {2, 4}}, LayoutUtil::MakeLayout({1, 0})); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Parameter(0, literal->shape(), "input"); std::unique_ptr data = @@ -501,7 +494,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { ASSERT_EQ(2, literal->Get({0, 1})); } // Use the original shape in building the computation. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.Parameter(0, original, "input"); // Use the slice operator to get an off-diagonal element. builder.Slice(input, {0, 1}, {1, 2}, {1, 1}); diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc index 10e44b274a8a9f3ac28dc40d7b1938d24a9ee40c..77159efb26f3b7dd4918f24305f7269a2d6ff647 100644 --- a/tensorflow/compiler/xla/tests/pred_test.cc +++ b/tensorflow/compiler/xla/tests/pred_test.cc @@ -17,9 +17,9 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -29,63 +29,62 @@ namespace { class PredTest : public ClientLibraryTestBase { protected: - void TestCompare(bool lhs, bool rhs, bool expected, - ComputationDataHandle (ComputationBuilder::*op)( - const ComputationDataHandle&, - const ComputationDataHandle&, - tensorflow::gtl::ArraySlice)) { - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle lhs_op = builder.ConstantR0(lhs); - ComputationDataHandle rhs_op = builder.ConstantR0(rhs); - ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {}); + void TestCompare( + bool lhs, bool rhs, bool expected, + XlaOp (XlaBuilder::*op)(const xla::XlaOp&, const xla::XlaOp&, + tensorflow::gtl::ArraySlice)) { + XlaBuilder builder(TestName()); + XlaOp lhs_op = builder.ConstantR0(lhs); + XlaOp rhs_op = builder.ConstantR0(rhs); + XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } }; TEST_F(PredTest, ConstantR0PredTrue) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR0(true); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, ConstantR0PredFalse) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR0(false); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, ConstantR0PredCompareEq) { - TestCompare(true, false, false, &ComputationBuilder::Eq); + TestCompare(true, false, false, &XlaBuilder::Eq); } TEST_F(PredTest, ConstantR0PredCompareNe) { - TestCompare(true, false, true, &ComputationBuilder::Ne); + TestCompare(true, false, true, &XlaBuilder::Ne); } TEST_F(PredTest, ConstantR0PredCompareLe) { - TestCompare(true, false, false, &ComputationBuilder::Le); + TestCompare(true, false, false, &XlaBuilder::Le); } TEST_F(PredTest, ConstantR0PredCompareLt) { - TestCompare(true, false, false, &ComputationBuilder::Lt); + TestCompare(true, false, false, &XlaBuilder::Lt); } TEST_F(PredTest, ConstantR0PredCompareGe) { - TestCompare(true, false, true, &ComputationBuilder::Ge); + TestCompare(true, false, true, &XlaBuilder::Ge); } TEST_F(PredTest, ConstantR0PredCompareGt) { - TestCompare(true, false, true, &ComputationBuilder::Gt); + TestCompare(true, false, true, &XlaBuilder::Gt); } TEST_F(PredTest, ConstantR1Pred) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({true, false, false, true}); ComputeAndCompareR1(&builder, {true, false, false, true}, {}); } TEST_F(PredTest, ConstantR2Pred) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({{false, true, true}, {true, false, false}}); const string expected = R"(pred[2,3] { @@ -96,28 +95,28 @@ TEST_F(PredTest, ConstantR2Pred) { } TEST_F(PredTest, AnyR1True) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({true, false}); TF_ASSERT_OK(Any(a, &builder).status()); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, AnyR1False) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({false, false}); TF_ASSERT_OK(Any(a, &builder).status()); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, AnyR1VacuouslyFalse) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({}); TF_ASSERT_OK(Any(a, &builder).status()); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, AnyR2True) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({ {false, false, false}, {false, false, false}, @@ -128,7 +127,7 @@ TEST_F(PredTest, AnyR2True) { } TEST_F(PredTest, AnyR2False) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({ {false, false, false}, {false, false, false}, diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 6aafb9fa6cb2175c478f0e9a5e16f5808cbea590..29a4f75001c688f2215745ab913df68bf2f62b76 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -16,8 +16,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -52,13 +52,14 @@ class PrngTest : public ClientLibraryTestBase { template std::unique_ptr PrngTest::UniformTest( T a, T b, tensorflow::gtl::ArraySlice dims, int64 seed) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngUniform( builder.ConstantR0(a), builder.ConstantR0(b), ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), dims)); SetSeed(seed); - auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + auto actual = + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions())); actual->EachCell([=](tensorflow::gtl::ArraySlice, T value) { EXPECT_LE(a, value); @@ -81,8 +82,7 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest(0, 1, {0x100, 0x100}); } XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest(5, 24, {12}); } // TODO(b/71543667): Fix Rng ops on LLVM backends. -XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL( - DISABLED_ON_CPU(ScalarBF16Tests)))) { +XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) { for (int64 seed = 0; seed < 100; ++seed) { // The largest negative number smaller than zero in bf16 that's not // denormalized. @@ -105,8 +105,7 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL( } // TODO(b/71543667): Fix Rng ops on LLVM backends. -XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU( - DISABLED_ON_CPU_PARALLEL(ScalarBF16CountTests)))) { +XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) { // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75, // they should get similar counts. bfloat16 low = static_cast(32.25); @@ -141,13 +140,14 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count, int64 seed) { int32 sample_size = range_size * expected_count; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngUniform(builder.ConstantR0(0), builder.ConstantR0(range_size), ShapeUtil::MakeShape(S32, {sample_size})); SetSeed(seed); - auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + auto actual = + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); std::vector counts(range_size, 0); actual->EachCell([&counts](tensorflow::gtl::ArraySlice, int32 value) { ++counts[value]; }); @@ -182,16 +182,15 @@ XLA_TEST_F(PrngTest, Uniformity256) { XLA_TEST_F(PrngTest, MapUsingRng) { // Build a x -> (x + U[0,1)) computation. - auto build_sum_rng = [this](ComputationBuilder& builder) { + auto build_sum_rng = [this](XlaBuilder& builder) { auto b = builder.CreateSubBuilder("sum_with_rng"); auto x = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "input"); - b->Add(x, - b->RngUniform(b->ConstantR0(0), b->ConstantR0(1), - ShapeUtil::MakeShape(F32, {}))); + b->Add(x, b->RngUniform(b->ConstantR0(0), b->ConstantR0(1), + ShapeUtil::MakeShape(F32, {}))); return b->BuildAndNoteError(); }; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({2.2f, 5.3f, 4.4f, 5.5f}); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr param0_data, @@ -226,7 +225,7 @@ XLA_TEST_F(PrngTest, MapUsingRng) { XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { // Build a U[0,1) computation. auto build_computation = [this]() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngUniform(builder.ConstantR0(0), builder.ConstantR0(1), ShapeUtil::MakeShape(F32, {10})); @@ -282,24 +281,24 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { } XLA_TEST_F(PrngTest, TenValuesN01) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngNormal(builder.ConstantR0(0), builder.ConstantR0(1), ShapeUtil::MakeShape(F32, {10})); SetSeed(42); - ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); // TODO(b/25995601): Test that resultant values are reasonable } XLA_TEST_F(PrngTest, RngUniformCrash) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // This used to crash XLA during LLVM IR generation for CPUs. auto rng_uniform = builder.RngUniform(builder.ConstantR0(0), builder.ConstantR0(1000 * 1000), ShapeUtil::MakeShape(S32, {})); SetSeed(0); - ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); } } // namespace diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc index 212512207cfdc4d2ebdc4e7fd8f5794852cc6a79..f95e75648343aa88bd7c39de4ee9f387f2b60506 100644 --- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc +++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -30,13 +30,13 @@ namespace { class QueryInferredShapeTest : public ClientLibraryTestBase {}; TEST_F(QueryInferredShapeTest, OnePlusOneShape) { - ComputationBuilder builder(client_, "one_plus_one"); + XlaBuilder builder("one_plus_one"); auto one = builder.ConstantR0(1.0); auto result = builder.Add(one, one); - StatusOr> shape_status = builder.GetShape(result); + StatusOr shape_status = builder.GetShape(result); ASSERT_IS_OK(shape_status.status()); auto shape = shape_status.ConsumeValueOrDie(); - ASSERT_TRUE(ShapeUtil::Equal(*shape, ShapeUtil::MakeShape(F32, {}))); + ASSERT_TRUE(ShapeUtil::Equal(shape, ShapeUtil::MakeShape(F32, {}))); } } // namespace diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index 423ccadb5b3b7df950824349737a833c08870d77..bcc05c2d41d8439b021cdf6533b5ca87c19aec1f 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -35,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -60,10 +59,9 @@ limitations under the License. namespace xla { namespace { -using FuncGeneratorForType = Computation (*)(PrimitiveType, - ComputationBuilder*); +using FuncGeneratorForType = XlaComputation (*)(PrimitiveType, XlaBuilder*); -using FuncGenerator = Computation (*)(ComputationBuilder*); +using FuncGenerator = XlaComputation (*)(XlaBuilder*); class ReduceTest : public ClientLibraryTestBase { protected: @@ -89,8 +87,8 @@ class ReduceTest : public ClientLibraryTestBase { // Runs an R1 => R0 reduction test with the given number of elements. void RunR1ToR0Test(int64 element_count) { - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {element_count}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -119,13 +117,13 @@ class ReduceTest : public ClientLibraryTestBase { void RunR1ToR0PredTest(bool and_reduce, tensorflow::gtl::ArraySlice input_data) { const int element_count = input_data.size(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Shape input_shape = ShapeUtil::MakeShape(S32, {element_count}); auto input_par = builder.Parameter(0, input_shape, "input"); auto pred_values = builder.Eq(input_par, builder.ConstantR1(element_count, 1)); - ComputationDataHandle init_value; - Computation reduce; + XlaOp init_value; + XlaComputation reduce; if (and_reduce) { init_value = builder.ConstantR0(true); reduce = CreateScalarAndComputation(&builder); @@ -157,13 +155,13 @@ class ReduceTest : public ClientLibraryTestBase { template void RunR2ToR1PredTest(bool and_reduce, int64 rows, int64 minor = 1, int64 major = 0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Shape input_shape = ShapeUtil::MakeShape(U8, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto input_pred = builder.Eq(input, builder.ConstantR0(1)); - ComputationDataHandle init_value; - Computation reduce_op; + XlaOp init_value; + XlaComputation reduce_op; if (and_reduce) { init_value = builder.ConstantR0(true); reduce_op = CreateScalarAndComputation(&builder); @@ -202,8 +200,8 @@ class ReduceTest : public ClientLibraryTestBase { // Runs an R2 => R0 reduction test with the given number of (rows, cols). void RunR2ToR0Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) { - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -230,8 +228,8 @@ class ReduceTest : public ClientLibraryTestBase { // Runs an R2 => R1 reduction test with the given number of (rows, cols). void RunR2ToR1Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) { - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -261,7 +259,7 @@ class ReduceTest : public ClientLibraryTestBase { template void ComputeAndCompareGeneric( typename std::enable_if::value, - ComputationBuilder>::type* builder, + XlaBuilder>::type* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments) { ComputeAndCompareR1(builder, expected, arguments, @@ -271,7 +269,7 @@ class ReduceTest : public ClientLibraryTestBase { template void ComputeAndCompareGeneric( typename std::enable_if::value, - ComputationBuilder>::type* builder, + XlaBuilder>::type* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments) { ComputeAndCompareR1(builder, expected, arguments); @@ -279,15 +277,15 @@ class ReduceTest : public ClientLibraryTestBase { template void RunVectorizedReduceTestForType( - const std::function& + const std::function& reduction_function_generator, const std::function& reference_reduction_function, const NativeT& initial_value) { const int rows = 64, cols = 128; const int minor = 1, major = 0; - ComputationBuilder builder(client_, TestName()); - Computation reduction_function = reduction_function_generator(&builder); + XlaBuilder builder(TestName()); + XlaComputation reduction_function = reduction_function_generator(&builder); const Shape input_shape = ShapeUtil::MakeShape( xla::primitive_util::NativeToPrimitiveType(), {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); @@ -322,7 +320,7 @@ class ReduceTest : public ClientLibraryTestBase { } void RunVectorizedReduceTest( - const std::function& + const std::function& reduction_function_generator_for_type, const std::function& reference_reduction_function_for_floats, @@ -334,21 +332,21 @@ class ReduceTest : public ClientLibraryTestBase { uint32 unsigned_int_identity) { // Float version RunVectorizedReduceTestForType( - [&](ComputationBuilder* builder) { + [&](XlaBuilder* builder) { return reduction_function_generator_for_type(F32, builder); }, reference_reduction_function_for_floats, floating_point_identity); // Signed int version RunVectorizedReduceTestForType( - [&](ComputationBuilder* builder) { + [&](XlaBuilder* builder) { return reduction_function_generator_for_type(S32, builder); }, reference_reduction_function_for_ints, signed_int_identity); // Unsigned int version RunVectorizedReduceTestForType( - [&](ComputationBuilder* builder) { + [&](XlaBuilder* builder) { return reduction_function_generator_for_type(U32, builder); }, reference_reduction_function_for_uints, unsigned_int_identity); @@ -442,8 +440,8 @@ XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) { XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) { const int64 rows = 111, cols = 50; - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -473,8 +471,8 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) { XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) { const int64 rows = 111, cols = 50; - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -522,8 +520,8 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) { XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) { const int64 rows = 111, cols = 50; - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, 2, cols / 2}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -569,7 +567,7 @@ void PrintTo(const BoundsLayout& spec, std::ostream* os) { // Add-reduces a broadcasted scalar matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto add = CreateScalarAddComputation(F32, &builder); auto scalar = builder.ConstantR0(42.0); auto broadcasted = builder.Broadcast(scalar, {500, 500}); @@ -581,7 +579,7 @@ XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) { // Max-reduces a broadcasted scalar matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto max = CreateScalarMaxComputation(F32, &builder); auto scalar = builder.ConstantR0(42.0); auto broadcasted = builder.Broadcast(scalar, {500, 500}); @@ -593,7 +591,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) { // Max-reduces a matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, MaxReduce2DToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto max = CreateScalarMaxComputation(F32, &builder); Array2D input(300, 250); input.FillRandom(214.0f); @@ -608,7 +606,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DToR0) { // Min-reduces matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, MinReduce2DToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto min = CreateScalarMinComputation(F32, &builder); Array2D input(150, 130); input.FillRandom(214.0f); @@ -623,7 +621,7 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) { } XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array2D input({{1}, {2}}); auto min = CreateScalarMinComputation(U32, &builder); auto input_literal = Literal::CreateR2FromArray2D(input); @@ -636,7 +634,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) { } XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array2D input({{1}, {2}}); auto max = CreateScalarMaxComputation(U32, &builder); auto input_literal = Literal::CreateR2FromArray2D(input); @@ -650,7 +648,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) { // Reduces a matrix among dimension 1. XLA_TEST_F(ReduceTest, Reduce2DAmong1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {1}); @@ -661,7 +659,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong1) { XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) { // Reduce a matrix among dimensions 0 and 1 (sum it up to a scalar). - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1}); @@ -671,7 +669,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) { // Tests 2D matrix ReduceToRow operation. XLA_TEST_F(ReduceTest, Reduce2DAmongY) { - ComputationBuilder builder(client_, "reduce_among_y"); + XlaBuilder builder("reduce_among_y"); auto m = builder.ConstantLiteral(*literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0}); @@ -681,7 +679,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmongY) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {1, 2}); @@ -691,7 +689,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1}); @@ -701,7 +699,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) { } XLA_TEST_F(ReduceTest, ReduceR3ToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1, 2}); @@ -711,7 +709,7 @@ XLA_TEST_F(ReduceTest, ReduceR3ToR0) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0}); @@ -726,7 +724,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {1}); @@ -743,7 +741,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {2}); @@ -817,7 +815,7 @@ class ReduceR3ToR2Test : public ReduceTest, public ::testing::WithParamInterface {}; XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const auto& bounds = GetParam().bounds; Array3D input_array(bounds[0], bounds[1], bounds[2]); // input_array.FillRandom(3.14f, 0.05); @@ -831,7 +829,7 @@ XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) { auto input_activations = builder.Parameter(0, input_literal->shape(), "input"); - Computation add = CreateScalarAddComputation(F32, &builder); + XlaComputation add = CreateScalarAddComputation(F32, &builder); auto sum = builder.Reduce(input_activations, builder.ConstantR0(0.0f), add, GetParam().reduce_dims); @@ -871,8 +869,8 @@ INSTANTIATE_TEST_CASE_P( // IrEmitterUnnested::EmitInitializer() for the Reduce operator. Failed on // 2017-07-26. XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OperationOnConstantAsInitValue)) { - ComputationBuilder builder(client_, TestName()); - Computation max_f32 = CreateScalarMaxComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation max_f32 = CreateScalarMaxComputation(F32, &builder); auto a = builder.ConstantR0(2.0f); auto a2 = builder.Abs(a); @@ -899,8 +897,8 @@ class ReduceInitializerTest : public ReduceTest { protected: template void DoTest(T initializer, int num_elems) { - ComputationBuilder builder(client_, TestName()); - Computation max_fn = CreateScalarMaxComputation( + XlaBuilder builder(TestName()); + XlaComputation max_fn = CreateScalarMaxComputation( primitive_util::NativeToPrimitiveType(), &builder); auto init = builder.ConstantR0(initializer); @@ -940,7 +938,7 @@ XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) { // returns one of the parameters). In this case, we return the rhs, which for // a 1D array with one element, should not be the init value. XLA_TEST_F(ReduceTest, ReduceIdentity) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Shape single_float = ShapeUtil::MakeShape(F32, {}); builder.Parameter(0, single_float, "lhs-unused"); builder.Parameter(1, single_float, "rhs-used"); diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 0a097667222d8c315cb9943688d6dbcb4426a8b3..10a3da3a387641ec45baf02d15790e32371601fa 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -861,8 +861,7 @@ INSTANTIATE_TEST_CASE_P( class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {}; // TODO(b/72234705): Fix the test cases failed on CPU and GPU. -XLA_TEST_P(R4ReduceWindowAnyDimsTest, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) { +XLA_TEST_P(R4ReduceWindowAnyDimsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) { DoIt(); } @@ -1151,7 +1150,7 @@ class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {}; // TODO(b/72234705): Fix the test cases failed on CPU and GPU. XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) { + DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) { DoIt(); } diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc index 6d063ffc363c092a1fbc40cbc22e87181d0c2502..36d763b0f7f4267ede076c0b25cfaf9654e96e0d 100644 --- a/tensorflow/compiler/xla/tests/replay_test.cc +++ b/tensorflow/compiler/xla/tests/replay_test.cc @@ -15,13 +15,13 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -38,17 +38,17 @@ class ReplayTest : public ClientLibraryTestBase {}; TEST_F(ReplayTest, TwoPlusTwoReplay) { // Make 2+2 computation. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto two = builder.ConstantR0(2); builder.Add(two, two); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); // Serialize it out. - std::unique_ptr module = + std::unique_ptr module = computation.Snapshot().ConsumeValueOrDie(); // Replay it. - Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); + XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); // Check signature is the same. std::unique_ptr original_shape = @@ -69,18 +69,18 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) { XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { // Make computation. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(S32, {}), "y"); builder.Add(x, y); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); // Serialize it out. - std::unique_ptr module = + std::unique_ptr module = computation.Snapshot().ConsumeValueOrDie(); // Replay it. - Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); + XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); // Check signature is the same. std::unique_ptr original_shape = @@ -109,24 +109,24 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { TEST_F(ReplayTest, MapPlusTwoOverR1) { // As above, but with map(+2) over some constant array. - ComputationBuilder plus_two_builder(client_, "plus two"); + XlaBuilder plus_two_builder("plus two"); auto input = plus_two_builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "input"); plus_two_builder.Add(input, plus_two_builder.ConstantR0(2)); - Computation plus_two = plus_two_builder.Build().ConsumeValueOrDie(); + XlaComputation plus_two = plus_two_builder.Build().ConsumeValueOrDie(); - ComputationBuilder mapper_builder(client_, TestName()); + XlaBuilder mapper_builder(TestName()); auto original = mapper_builder.ConstantR1({1, 2, 3}); mapper_builder.Map({original}, plus_two, {0}); - Computation computation = mapper_builder.Build().ConsumeValueOrDie(); + XlaComputation computation = mapper_builder.Build().ConsumeValueOrDie(); // Serialize it out. - std::unique_ptr module = + std::unique_ptr module = computation.Snapshot().ConsumeValueOrDie(); // Replay it. - Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); + XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); // Check signature is the same. std::unique_ptr original_shape = @@ -135,10 +135,6 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) { client_->GetComputationShape(replayed).ConsumeValueOrDie(); ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); - // Destroy the originals. - computation.Reset(); - plus_two.Reset(); - // Run it. std::unique_ptr literal = client_ diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc index e045e164e2e2db7d3480e7c2d1e20f461820ae67..5ebd5268992846e80dcce2675f8e92038e190ecf 100644 --- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc @@ -20,10 +20,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -45,7 +44,7 @@ namespace { using ReshapeMotionTest = ClientLibraryTestBase; TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({{2, 3, 5}, {7, 11, 13}}); auto b = builder.ConstantR2({{17, 19}, {23, 29}, {31, 37}}); auto c = builder.Reshape(a, {6}); diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc index 6959c95502cb7af6b720592e7836c6789719a528..e7bd142dc9ddefbd8bebfb77d72218d662645c31 100644 --- a/tensorflow/compiler/xla/tests/reverse_test.cc +++ b/tensorflow/compiler/xla/tests/reverse_test.cc @@ -114,7 +114,7 @@ class ReverseTest : public ClientLibraryTestBase {}; // Tests the reverse operation on a 4D U8 array on dimension 0 and 3. XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); // Input shape is U8[1x2x3x4]. // clang-format off Array4D input({{ @@ -144,7 +144,7 @@ XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) { // Tests the reverse operation on a 4D float array on dimension 0 and 1. TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); // Input shape is float[4x3x2x1]. // clang-format off Array4D input({ diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index e2d406f66d94f8ec76faa5b7d2d2e84dcaf6db57..7ca99a91635e85cd0888e59ecde31e47fec21844 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #define DISABLED_ON_CPU(X) X -#define DISABLED_ON_CPU_PARALLEL(X) X #define DISABLED_ON_GPU(X) X #define DISABLED_ON_INTERPRETER(X) X @@ -51,13 +50,6 @@ limitations under the License. # define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X) #endif // XLA_TEST_BACKEND_CPU -#ifdef XLA_TEST_BACKEND_CPU_PARALLEL -# undef DISABLED_ON_CPU -# define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X) -# undef DISABLED_ON_CPU_PARALLEL -# define DISABLED_ON_CPU_PARALLEL(X) XLA_TEST_PASTE(DISABLED_, X) -#endif // XLA_TEST_BACKEND_CPU_PARALLEL - #ifdef XLA_TEST_BACKEND_GPU # undef DISABLED_ON_GPU # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X) diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index cda1989fad670c805f30b5043e342d5f9a9a6fe2..997a1d8273736af31994ebbd07ff3857d1e8e0b5 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -339,8 +339,8 @@ StatusOr>> MakeFakeArguments( return std::move(arguments); } -Status VerifyHloModule(const perftools::gputools::Platform& platform, - HloModule* const module, bool allow_mixed_precision) { +Status VerifyHloModule(const se::Platform& platform, HloModule* const module, + bool allow_mixed_precision) { return HloVerifier(allow_mixed_precision).Run(module).status(); } diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h index b5ab779574fd5237d14cd24c345a9d5f1d41d1fd..30c147910cae85e1ebdddc22e637a6c1fd577c20 100644 --- a/tensorflow/compiler/xla/tests/test_utils.h +++ b/tensorflow/compiler/xla/tests/test_utils.h @@ -68,8 +68,7 @@ StatusOr>> MakeFakeArguments( // Check that a given module satisfies various constraints before trying to // execute it. -Status VerifyHloModule(const perftools::gputools::Platform& platform, - HloModule* const module, +Status VerifyHloModule(const se::Platform& platform, HloModule* const module, bool allow_mixed_precision = false); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc index e8efc6e2a83f42bf81fc1261ba508632cf3f85b3..59afd28a80c0fbf3df38457cd05961c883769856 100644 --- a/tensorflow/compiler/xla/tests/test_utils_test.cc +++ b/tensorflow/compiler/xla/tests/test_utils_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/local_client_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -28,7 +28,7 @@ namespace { class TestUtilsTest : public LocalClientTestBase {}; XLA_TEST_F(TestUtilsTest, UnusedParam) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); // Make the reduction lambda. Shape single_float = ShapeUtil::MakeShape(F32, {}); builder.Parameter(0, single_float, "unused"); diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index 268ba338f2e6740a1d1a046d5a85494f3cf2e9f8..e2067bc1b835a946fc56801cbf227e05ef0686b4 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -45,7 +45,7 @@ class TransferManagerTest : public LocalClientTestBase { ~TransferManagerTest() override = default; - std::unique_ptr AllocateDeviceBuffer(const Shape& shape) { + ScopedShapedBuffer AllocateDeviceBuffer(const Shape& shape) { return transfer_manager_ ->AllocateScopedShapedBuffer( shape, GetOrCreateAllocator(local_client_->platform()), @@ -64,10 +64,10 @@ XLA_TEST_F(TransferManagerTest, TransferR0U32) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectR0Equal(42, *result); } @@ -80,10 +80,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1F32) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectR1Equal({1.25f, 2.5f, -17.0f, -20.125f}, *result); @@ -98,10 +98,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectR1Equal(test_vector, *result); } @@ -114,10 +114,10 @@ XLA_TEST_F(TransferManagerTest, TransferR1U8) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); EXPECT_EQ(result->GetR1U8AsString(), test_string); } @@ -130,10 +130,10 @@ XLA_TEST_F(TransferManagerTest, TransferR2F32) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result); @@ -150,10 +150,10 @@ XLA_TEST_F(TransferManagerTest, // Round trip literal through device. Set the on-device layout to something // different than the literal layout. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); EXPECT_FALSE( LayoutUtil::Equal(result->shape().layout(), literal->shape().layout())); @@ -170,10 +170,10 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -184,10 +184,10 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -204,10 +204,10 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -219,10 +219,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } @@ -238,10 +238,10 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) { // Round trip literal through device. ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice( - stream_executor_, *literal, *device_buffer)); + stream_executor_, *literal, device_buffer)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result, transfer_manager_->TransferLiteralFromDevice( - stream_executor_, *device_buffer)); + stream_executor_, device_buffer)); LiteralTestUtil::ExpectEqual(*literal, *result); } diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 098be6d7aabe88d0deef600716229ddbd0bcae2f..61be1746530a1991271107910d033f6b93ea9bfd 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -269,7 +269,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) { ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesOnFalse) { // Tests a selection between tuples with "false" path taken. XlaBuilder builder(TestName()); @@ -313,7 +313,7 @@ XLA_TEST_F(TupleTest, TuplesInAMap) { ComputeAndCompareR1(&b, {-99.0f, 101.0f, 214.41f}, {}, error_spec_); } -XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesOnTrue) { // Tests a selection between tuples with "true" path taken. XlaBuilder builder(TestName()); @@ -350,7 +350,7 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) { } // Cascaded selects between tuple types. -XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesCascaded) { // // vec1 vec2 vec2 vec1 // | | | | @@ -390,8 +390,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) { ComputeAndCompareR1(&builder, {3.f, 6.f, 9.f}, {}, error_spec_); } -XLA_TEST_F(TupleTest, - DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesReuseConstants)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesReuseConstants) { // Similar to SelectBetweenTuples, but the constants are shared between the // input tuples. XlaBuilder builder(TestName()); @@ -516,10 +515,8 @@ XLA_TEST_F(TupleTest, ComplexTuples) { class TupleHloTest : public HloTestBase {}; -// Disabled on CPU parallel because that's broken and will be removed soon. // Disabled on the interpreter because bitcast doesn't exist on the interpreter. -TEST_F(TupleHloTest, - DISABLED_ON_INTERPRETER(DISABLED_ON_CPU_PARALLEL(BitcastAfterGTE))) { +TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) { const char* testcase = R"( HloModule m @@ -535,8 +532,7 @@ TEST_F(TupleHloTest, HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) .ValueOrDie(); auto param = Literal::MakeTupleOwned(Literal::CreateR1({1, 2, 3})); - TF_ASSERT_OK_AND_ASSIGN(auto result, - ExecuteNoHloPasses(std::move(module), {param.get()})); + auto result = ExecuteNoHloPasses(std::move(module), {param.get()}); EXPECT_TRUE(LiteralTestUtil::Equal( *result, *Literal::MakeTupleOwned(Literal::CreateR2({{1, 2, 3}})))); diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc index 32ba067a10df6c15348344da813e6a960f05491c..82d301983fc7885ef5c1c1ed05b74fc017bb7727 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc @@ -19,9 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -33,9 +33,9 @@ namespace { class VecOpsReduceTest : public ClientLibraryTestBase { public: - VecOpsReduceTest() : builder_(client_, TestName()) {} + VecOpsReduceTest() : builder_(TestName()) {} - ComputationDataHandle BuildSampleConstantCube() { + XlaOp BuildSampleConstantCube() { // clang-format off Array3D x3d({ {{1.0, 2.0, 3.0}, // | dim 1 // } plane 0 in dim 0 @@ -49,7 +49,7 @@ class VecOpsReduceTest : public ClientLibraryTestBase { return builder_.ConstantR3FromArray3D(x3d); } - ComputationBuilder builder_; + XlaBuilder builder_; ErrorSpec errspec_{1e-3, 0}; }; diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index b52c718814d4ffeff68c60588a6637a2159d57e5..3dded3f7157195b2c7aaac2ff9aac79ca4611d05 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -19,10 +19,11 @@ limitations under the License. #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -39,7 +40,7 @@ namespace { class VecOpsSimpleTest : public ClientLibraryTestBase { public: - explicit VecOpsSimpleTest(perftools::gputools::Platform* platform = nullptr) + explicit VecOpsSimpleTest(se::Platform* platform = nullptr) : ClientLibraryTestBase(platform) { mutable_debug_options()->add_xla_disable_hlo_passes("algsimp"); mutable_debug_options()->add_xla_disable_hlo_passes("inline"); @@ -49,7 +50,7 @@ class VecOpsSimpleTest : public ClientLibraryTestBase { }; XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto exp = builder.Exp(x); @@ -63,7 +64,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) { XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) { for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector exponents; exponents.reserve(count); for (int i = 0; i < count; ++i) { @@ -84,7 +85,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) { } XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D exponents(2, 2, 2, 2); std::vector exponents_vector; @@ -106,7 +107,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) { } XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); builder.Neg(x); @@ -117,7 +118,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) { } XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({2, -2, 12, -4, 5, 20, -15, 0, -2, 1}); builder.Neg(x); @@ -126,7 +127,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) { } XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {0, 1, 42, static_cast(-1), static_cast(-12)}); builder.Neg(x); @@ -136,7 +137,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) { } XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); builder.SquareF32(x); @@ -147,7 +148,7 @@ XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) { } XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); builder.ReciprocalF32(x); @@ -159,7 +160,7 @@ XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) { } XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({0.0, -0.0}); auto exp = builder.SqrtF32(x); @@ -167,7 +168,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) { } XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345}); auto exp = builder.SqrtF32(x); @@ -176,7 +177,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) { } XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345}); auto exp = builder.Pow(x, builder.ConstantR0(-.5f)); @@ -188,7 +189,7 @@ XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) { } XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto add = CreateScalarAddComputation(F32, &builder); auto x = builder.ConstantR1( @@ -203,7 +204,7 @@ XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { } XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto y = builder.ConstantR1( @@ -218,8 +219,8 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) { XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) { // Similar to MaxTenValues, except that the inputs come from params rather // than constants. - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle v1, v2; + XlaBuilder builder(TestName()); + XlaOp v1, v2; std::unique_ptr param0_data = CreateR1Parameter( {41.0f, 2.0f, 3.0f, 84.0f}, /*parameter_number=*/0, /*name=*/"v1", /*builder=*/&builder, /*data_handle=*/&v1); @@ -236,7 +237,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) { XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { // Similar to MaxTenValuesFromParams, except that the data size passed in and // out is large. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Number of floats in the data passed into and out of the computation. constexpr int datalen = 15 * 1000; @@ -259,7 +260,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { expected_vec.push_back(larger); } - ComputationDataHandle v1, v2; + XlaOp v1, v2; std::unique_ptr param0_data = CreateR1Parameter(v1vec, /*parameter_number=*/0, /*name=*/"v1", /*builder=*/&builder, /*data_handle=*/&v1); @@ -274,7 +275,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { } XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto y = builder.ConstantR0(0); @@ -286,7 +287,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) { } XLA_TEST_F(VecOpsSimpleTest, MinTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto y = builder.ConstantR1( @@ -299,7 +300,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinTenValues) { } XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0); auto one = builder.ConstantR0(1); auto x = builder.ConstantR1( @@ -312,7 +313,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) { } XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0); auto one = builder.ConstantR0(1); auto x = builder.ConstantR1( @@ -325,7 +326,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) { } XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR1({0.0f, 0.0f}); auto one = builder.ConstantR1({1.0f, 1.0f}); auto x = builder.ConstantR1({2.1, -2.6}); @@ -336,7 +337,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) { } XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto one = builder.ConstantR0(1); auto two = builder.ConstantR0(2); auto x = builder.ConstantR1( @@ -348,11 +349,22 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { ComputeAndCompareR1(&builder, expected, {}); } +XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) { + ComputationBuilder builder(client_, TestName()); + auto zero = builder.ConstantR0(0); + auto one = builder.ConstantR0(10); + auto x = builder.ConstantR1({-3, 3, 9, 13}); + auto clamp = builder.Clamp(zero, x, one); + + std::vector expected = {0, 3, 9, 10}; + ComputeAndCompareR1(&builder, expected, {}); +} + XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { - Computation add_half; + XlaComputation add_half; { // add_half(x) = x + 0.5 - ComputationBuilder builder(client_, "add_half"); + XlaBuilder builder("add_half"); auto x_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value"); auto half = builder.ConstantR0(0.5); @@ -362,10 +374,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { add_half = computation_status.ConsumeValueOrDie(); } - Computation clamp; + XlaComputation clamp; { // clamp(y) = clamp<0,5>(y) - ComputationBuilder builder(client_, "clamp"); + XlaBuilder builder("clamp"); auto y_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y_value"); auto zero = builder.ConstantR0(0.0); @@ -375,10 +387,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { clamp = computation_status.ConsumeValueOrDie(); } - Computation mult_relu_add; + XlaComputation mult_relu_add; { // mult_relu_add(z) = clamp(add_half(2 * max(z, 0))) - ComputationBuilder builder(client_, "mult_relu_add"); + XlaBuilder builder("mult_relu_add"); auto z_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value"); auto zero = builder.ConstantR0(0.0); @@ -392,7 +404,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { mult_relu_add = computation_status.ConsumeValueOrDie(); } - ComputationBuilder builder(client_, "map10"); + XlaBuilder builder("map10"); { auto x = builder.ConstantR1( {2.1, -21.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); @@ -405,7 +417,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { } XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}); auto y = builder.ConstantR0(3); builder.Rem(x, y); @@ -415,7 +427,7 @@ XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) { } XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({false, true}); auto y = builder.ConstantR1({true, false}); builder.Eq(x, y); @@ -425,7 +437,7 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) { } XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({false, true}); auto y = builder.ConstantR1({true, false}); builder.Ne(x, y); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 89ce2ce797f979b8668fbdb172a4a3abc5922b9f..336fed27c6f19fab7f8b171b7364e323a6e08a07 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -37,8 +37,6 @@ limitations under the License. #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" -namespace se = ::perftools::gputools; - namespace xla { namespace { @@ -1323,10 +1321,6 @@ void BM_WhileLoop(int num_iters) { } } -// TODO(b/32470510): Benchmark fails on parallel CPU backend. -#ifndef XLA_TEST_BACKEND_CPU_PARALLEL BENCHMARK(BM_WhileLoop); -#endif - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index ff3418a128eed82b730a6602d6e3faba4ad7be32..8354bb71cb7e8875b654e6624a5a1a13cbf30d9d 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -34,7 +34,7 @@ limitations under the License. namespace xla { namespace { -namespace se = ::perftools::gputools; + namespace gtl = ::tensorflow::gtl; class HloProfileTest : public ClientLibraryTestBase {}; @@ -129,18 +129,18 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, auto* transfer_manager = backend->transfer_manager(); TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr lhs_arg, + ScopedShapedBuffer lhs_arg, transfer_manager->AllocateScopedShapedBuffer( lhs_arg_shape, allocator, backend->default_device_ordinal())); TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - executor, *Literal::CreateFromShape(lhs_arg_shape), *lhs_arg)); + executor, *Literal::CreateFromShape(lhs_arg_shape), lhs_arg)); TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr rhs_arg, + ScopedShapedBuffer rhs_arg, transfer_manager->AllocateScopedShapedBuffer( rhs_arg_shape, allocator, backend->default_device_ordinal())); TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - executor, *Literal::CreateFromShape(rhs_arg_shape), *rhs_arg)); + executor, *Literal::CreateFromShape(rhs_arg_shape), rhs_arg)); TF_ASSERT_OK_AND_ASSIGN( std::unique_ptr local_executable, @@ -165,7 +165,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, backend->eigen_intra_op_thread_pool()); TF_ASSERT_OK_AND_ASSIGN( auto execution_result, - executable->ExecuteOnStream(&run_options, {lhs_arg.get(), rhs_arg.get()}, + executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg}, &hlo_execution_profile)); (void)execution_result; @@ -175,8 +175,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, XLA_VLOG_LINES(4, *profile_output); } -// TODO(b/71364943): This test exposes a bug in the parallel CPU backend. -XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) { +XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { const int64 m = 256, k = 256, n = 256; Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k}); Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k}); @@ -239,12 +238,9 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) { EXPECT_TRUE(HasTrops(tanh_profile)); } -// TODO(b/71364943): This test exposes a bug in the parallel CPU backend. -// // TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo // instructions "interior" to while nodes. -XLA_TEST_F(HloProfileTest, - DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) { +XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { const int64 size = 256; Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size}); Shape while_result_shape = diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc index b2f122982adf750106f034e7e786367720ebafcf..fdbfc0210ea63ac4350ba48ac3354d23c53c69a7 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc +++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc @@ -303,18 +303,14 @@ bool HloParser::ParseComputations() { // set the layouts to what the hlo text says. for (int p = 0; p < computation->num_parameters(); p++) { const Shape& param_shape = computation->parameter_instruction(p)->shape(); - if (param_shape.has_layout()) { - module_->mutable_entry_computation_layout() - ->mutable_parameter_layout(p) - ->ResetLayout(param_shape.layout()); - } + TF_CHECK_OK(module_->mutable_entry_computation_layout() + ->mutable_parameter_layout(p) + ->CopyLayoutFromShape(param_shape)); } const Shape& result_shape = computation->root_instruction()->shape(); - if (result_shape.has_layout()) { - module_->mutable_entry_computation_layout() - ->mutable_result_layout() - ->ResetLayout(result_shape.layout()); - } + TF_CHECK_OK(module_->mutable_entry_computation_layout() + ->mutable_result_layout() + ->CopyLayoutFromShape(result_shape)); } return true; @@ -470,6 +466,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, case HloOpcode::kRoundNearestAfz: case HloOpcode::kBitcast: case HloOpcode::kCeil: + case HloOpcode::kClz: case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kExp: @@ -724,15 +721,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, shape, operands[0], *broadcast_dimensions)); break; } - case HloOpcode::kBroadcastDimOne: { - if (!ParseOperands(&operands, /*expected_size=*/1) || - !ParseAttributes(attrs)) { - return false; - } - instruction = builder->AddInstruction( - HloInstruction::CreateBroadcastDimOne(shape, operands[0])); - break; - } case HloOpcode::kConcatenate: { optional> dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc index 57684b58346166f7e3ef9576f6cd8f70ab9dc389..adc8b1d620eb65fdca19072831360b71847abf9e 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc @@ -57,18 +57,6 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] { ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y) } -)" -}, -// broadcast size-one dimensions -{ -"BroadcastDimOne", -R"(HloModule broadcast_dim_one_module - -ENTRY %broadcast-dim-one () -> f32[2,2] { - %constant = f32[1,2]{1,0} constant(f32[1,2] { { 1.1, 2.2 } }) - ROOT %broadcast-dim-one = f32[2,2]{1,0} broadcast-dim-one(f32[1,2]{1,0} %constant) -} - )" }, // pred constant diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h index 9fa4297523bab0748863479be52dff1b7b523a8b..b645acb700b0f168112a40c9c72b4669435f717d 100644 --- a/tensorflow/compiler/xla/types.h +++ b/tensorflow/compiler/xla/types.h @@ -46,4 +46,10 @@ using ::Eigen::half; } // namespace xla +// Alias namespace ::stream_executor as ::xla::se. +namespace stream_executor {} +namespace xla { +namespace se = ::stream_executor; +} // namespace xla + #endif // TENSORFLOW_COMPILER_XLA_TYPES_H_ diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index 2da9f9ed6f40fcf5b2512f974519df0b355da10f..be33bd6dd1304fa8fc6e5aed1d4c4d65bf97e692 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -528,6 +528,16 @@ bool IsInt32(T x) { // value is implementation-defined." return static_cast(x) == x; } + +template +Status EraseElementFromVector(std::vector* container, const T& value) { + // c_find returns a const_iterator which does not seem to work on gcc 4.8.4, + // and this breaks the ubuntu/xla_gpu build bot. + auto it = std::find(container->begin(), container->end(), value); + TF_RET_CHECK(it != container->end()); + container->erase(it); + return Status::OK(); +} } // namespace xla #define XLA_LOG_LINES(SEV, STRING) \ diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc index 93284b80f9e1f82c4b18dc7388754d5c01a7740c..f11123ca24849af1d9c4fd49809a986eb7202bd5 100644 --- a/tensorflow/compiler/xla/window_util.cc +++ b/tensorflow/compiler/xla/window_util.cc @@ -199,6 +199,9 @@ bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) { int64 DilatedBound(int64 bound, int64 dilation) { CHECK_GE(bound, 0); CHECK_GE(dilation, 1); + if (bound == 0) { + return 0; + } // Suppose the array has three entries 123 and the dilation factor is 4. Then // the dilated array has 9 entries 1xxx2xxx3. Here, each original entry except @@ -212,7 +215,7 @@ int64 StridedBound(int64 bound, int64 window_size, int64 stride) { CHECK_GE(bound, 0); CHECK_GE(stride, 1); - if (window_size > bound) { + if (bound == 0 || window_size > bound) { return 0; } diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index f18d53c6089e8d4411099be8fb0fb8c349ace4f7..d23f9e5918f54c4f385f3b16fd84bbee51ed5a95 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -801,6 +801,9 @@ enum UnaryOperation { // Elementwise, extract real component of complex x. UNOP_IMAG = 16; + + // Elementwise, computes clz(x). + UNOP_CLZ = 17; } message UnaryOpRequest { diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 7e47516550068e8f38d2155e48e229c2ab77b488..abdbdb4cd22ff38a0fae89af10c600a178d9a3d4 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -25,11 +25,13 @@ py_library( "//tensorflow/contrib/batching:batch_py", "//tensorflow/contrib/bayesflow:bayesflow_py", "//tensorflow/contrib/boosted_trees:init_py", + "//tensorflow/contrib/checkpoint/python:checkpoint", "//tensorflow/contrib/cloud:cloud_py", "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", - "//tensorflow/contrib/coder:coder_ops_py", + "//tensorflow/contrib/coder:coder_py", "//tensorflow/contrib/compiler:compiler_py", + "//tensorflow/contrib/constrained_optimization", "//tensorflow/contrib/copy_graph:copy_graph_py", "//tensorflow/contrib/crf:crf_py", "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py", diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 36cc5144d072893a950c1701fc46fb55510d1fd2..7f33d460dce0778601ecfd3456f118189a3f346e 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -24,10 +24,12 @@ import os # Add projects here, they will show up under tf.contrib. from tensorflow.contrib import batching from tensorflow.contrib import bayesflow +from tensorflow.contrib import checkpoint from tensorflow.contrib import cloud from tensorflow.contrib import cluster_resolver from tensorflow.contrib import coder from tensorflow.contrib import compiler +from tensorflow.contrib import constrained_optimization from tensorflow.contrib import copy_graph from tensorflow.contrib import crf from tensorflow.contrib import cudnn_rnn diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py index 8add2aacff1d64f1617cd24167c4c6c6706044da..159d985db5c48f8fe1a26350255f8d8f68482473 100644 --- a/tensorflow/contrib/all_reduce/python/all_reduce.py +++ b/tensorflow/contrib/all_reduce/python/all_reduce.py @@ -18,10 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import math -import re from tensorflow.contrib import nccl +from tensorflow.python.framework import device as device_lib from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -659,21 +660,20 @@ def _split_by_task(devices, values): num_devices = len(devices) if num_devices != len(values): raise ValueError("len(devices) must equal len(values)") - pattern = re.compile(r"/task:(\d+)/") - per_task_devices = [] - per_task_values = [] + per_task_devices = collections.OrderedDict() + per_task_values = collections.OrderedDict() for d in range(num_devices): - m = pattern.search(devices[d]) - if m: - index = int(m.group(1)) - while index >= len(per_task_devices): - per_task_devices.append([]) - per_task_values.append([]) - per_task_devices[index].append(devices[d]) - per_task_values[index].append(values[d]) - else: + d_spec = device_lib.DeviceSpec.from_string(devices[d]) + if not hasattr(d_spec, "task") or d_spec.task is None: assert False, "failed to parse device %s" % devices[d] - return (per_task_devices, per_task_values) + index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task) + if index not in per_task_devices: + per_task_devices[index] = [] + per_task_values[index] = [] + per_task_devices[index].append(devices[d]) + per_task_values[index].append(values[d]) + + return (list(per_task_devices.values()), list(per_task_values.values())) def build_nccl_all_reduce(input_tensors, red_op, un_op=None): diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md index 7e84f237dc9a83098f142a54c48cf5b6ba35aaaa..0fcbf5dd59ceceb337434b1e27006ddaf620b017 100644 --- a/tensorflow/contrib/autograph/README.md +++ b/tensorflow/contrib/autograph/README.md @@ -1,4 +1,117 @@ -# Autograph +# AutoGraph -A compiler for generating TensorFlow numeric and control flow ops from Python -code. +IMPORTANT: AutoGraph is pre-alpha, under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback! + +AutoGraph is a Python to TensorFlow compiler. + +With AutoGraph, you can write [Eager style](https://www.tensorflow.org/programmers_guide/eager) code in a concise manner, and run it as a TensorFlow graph. AutoGraph uses source code transformation and partial evaluation to generate Python code that builds an equivalent TensorFlow subgraph. The result is code that behaves like ops and can be freely combined with other TensorFlow ops. + +For example, this Python function: + +``` +def f(x): + if x < 0: + x = -x + return x +``` + +would be converted to this: + +``` +def graph_mode_f(x): + with tf.name_scope('f'): + + def if_true(): + with tf.name_scope('if_true'): + x_1, = x, + x_1 = tf.negative(x_1) + return x_1, + + def if_false(): + with tf.name_scope('if_false'): + x_1, = x, + return x_1, + x = ag__.utils.run_cond(tf.greater(x, 0), if_true, if_false) + return x +``` + +so you can use it like an op: + +``` +with tf.Graph().as_default(): + x = tf.constant(-1.0) + + converted_f = autograph.to_graph(f) + y = converted_f(x) + + with tf.Session() as sess: + print(sess.run(y)) + # Output: 1 +``` + +# Getting started + +Use AutoGraph in one of the following ways, described below: + + 1. Annotations (simpler) + 2. Functional API (more flexible) + +NOTE: You can find more examples in this [interactive notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb). + +To get started, install the latest nightly TensorFlow build: + +```shell +pip install -U tf-nightly +``` + +Then import the `autograph` module from `tf.contrib`: + +``` +from tensorflow.contrib import autograph as ag +``` + +## Using with annotations + +Annotating a function or class with `@convert` converts it in place: + +``` +@ag.convert() +def f(x): + if x < 0: + x = -x + return x +``` + +... so that it always outputs TensorFlow code: + +``` +with tf.Graph().as_default(): + x = tf.constant(-1) + + y = f(x) + + with tf.Session() as sess: + print(sess.run(y)) + # Output: 1 +``` + +## Using the functional API + +The functional API allows you to convert an existing function, class or object after it was defined: + +``` +converted_f = ag.to_graph(f) + +print(converted_f(tf.constant(-1))) +# Output: Tensor + +print(f(-1)) +# Output: 1 +``` + +You can use the functional API to inspect the generated code as well: + +``` +print(ag.to_code(f)) +# Output: +``` diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py index 5dfb7a59d51859983c7e0c5549facc3a48b2d285..91de82f0a78ccae711298d78364810dd099a5c38 100644 --- a/tensorflow/contrib/autograph/converters/break_statements.py +++ b/tensorflow/contrib/autograph/converters/break_statements.py @@ -24,72 +24,102 @@ from tensorflow.contrib.autograph.pyct import transformer from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno -class BreakCanonicalizationTransformer(transformer.Base): +# Tags for local state. +BREAK_USED = 'break_used' +CONTROL_VAR_NAME = 'control_var_name' + + +class BreakStatementTransformer(transformer.Base): """Canonicalizes break statements into additional conditionals.""" - def __init__(self, context): - super(BreakCanonicalizationTransformer, self).__init__(context) - # This is a stack structure, to correctly process nested loops. - # Each item is a list [break_used, break_variable_name] - self.break_uses = [] + def _track_body(self, nodes, break_var): + self.enter_local_scope() + self.set_local(CONTROL_VAR_NAME, break_var) + nodes = self.visit_block(nodes) + break_used = self.get_local(BREAK_USED, False) + self.exit_local_scope() + return nodes, break_used def visit_Break(self, node): - self.break_uses[-1][0] = True + self.set_local(BREAK_USED, True) + var_name = self.get_local(CONTROL_VAR_NAME) + # TODO(mdan): This will fail when expanded inside a top-level else block. template = """ var_name = True continue """ - return templates.replace(template, var_name=self.break_uses[-1][1]) + return templates.replace(template, var_name=var_name) + + def _guard_if_present(self, block, var_name): + """Prevents the block from executing if var_name is set.""" + if not block: + return block + template = """ + if not var_name: + block + """ + node = templates.replace( + template, + var_name=var_name, + block=block) + return node def visit_While(self, node): scope = anno.getanno(node, NodeAnno.BODY_SCOPE) - break_var = self.context.namer.new_symbol('break_requested', - scope.referenced) + break_var = self.context.namer.new_symbol('break__', scope.referenced) - self.break_uses.append([False, break_var]) - node = self.generic_visit(node) - if self.break_uses[-1][0]: + node.test = self.visit(node.test) + node.body, break_used = self._track_body(node.body, break_var) + # A break in the else clause applies to the containing scope. + node.orelse = self.visit_block(node.orelse) + + if break_used: template = """ var_name = False - while original_test and not var_name: - original_body + while test and not var_name: + body else: - original_orelse + orelse """ + # Python's else clause only triggers if the loop exited cleanly (e.g. + # break did not trigger). node = templates.replace( template, var_name=break_var, - original_test=node.test, - original_body=node.body, - original_orelse=node.orelse) - self.break_uses.pop() + test=node.test, + body=node.body, + orelse=self._guard_if_present(node.orelse, break_var)) return node def visit_For(self, node): scope = anno.getanno(node, NodeAnno.BODY_SCOPE) - break_var = self.context.namer.new_symbol('break_requested', - scope.referenced) + break_var = self.context.namer.new_symbol('break__', scope.referenced) + + node.target = self.visit(node.target) + node.iter = self.visit(node.iter) + node.body, break_used = self._track_body(node.body, break_var) + # A break in the else clause applies to the containing scope. + node.orelse = self.visit_block(node.orelse) - self.break_uses.append([False, break_var]) - node = self.generic_visit(node) - if self.break_uses[-1][0]: + if break_used: + node.orelse = self._guard_if_present(node.orelse, break_var) template = """ var_name = False - original_for + for_stmt """ + # Python's else clause only triggers if the loop exited cleanly (e.g. + # break did not trigger). node = templates.replace( template, var_name=break_var, - original_for=node) + for_stmt=node) extra_cond = templates.replace_as_expression( 'not var_name', var_name=break_var) - new_for_node = node[1] - anno.setanno(new_for_node, 'extra_cond', extra_cond) - self.break_uses.pop() + anno.setanno(node[1], 'extra_cond', extra_cond) return node def transform(node, context): - return BreakCanonicalizationTransformer(context).visit(node) + return BreakStatementTransformer(context).visit(node) diff --git a/tensorflow/contrib/autograph/converters/break_statements_test.py b/tensorflow/contrib/autograph/converters/break_statements_test.py index dd4914a022f57b3bb4a19ec132f311f12269fa9e..1af59e9b5260fe0d3a3ef72c7a003dc451e230f3 100644 --- a/tensorflow/contrib/autograph/converters/break_statements_test.py +++ b/tensorflow/contrib/autograph/converters/break_statements_test.py @@ -25,7 +25,7 @@ from tensorflow.python.platform import test class BreakCanonicalizationTest(converter_test_base.TestCase): - def test_basic_break(self): + def test_basic_while(self): def test_fn(x): v = [] @@ -40,13 +40,11 @@ class BreakCanonicalizationTest(converter_test_base.TestCase): node = break_statements.transform(node, self.ctx) with self.compiled(node) as result: - self.assertEqual(test_fn(0), result.test_fn(0)) - self.assertEqual(test_fn(1), result.test_fn(1)) - self.assertEqual(test_fn(2), result.test_fn(2)) - self.assertEqual(test_fn(3), result.test_fn(3)) - self.assertEqual(test_fn(4), result.test_fn(4)) + self.assertEqual([], result.test_fn(0)) + self.assertEqual([], result.test_fn(1)) + self.assertEqual([3], result.test_fn(4)) - def test_basic_break_for_loop(self): + def test_basic_for(self): def test_fn(a): v = [] @@ -57,30 +55,18 @@ class BreakCanonicalizationTest(converter_test_base.TestCase): v.append(x) return v - # The break is incompletely canonicalized for for loops. Everything is - # in place except for the condition verification. - def test_equiv_fn(a): - v = [] - for x in a: - x -= 1 - if x % 2 == 0: - continue - v.append(x) - return v - node = self.parse_and_analyze(test_fn, {}) node = break_statements.transform(node, self.ctx) with self.compiled(node) as result: - # The break is incompletely canonicalized. Everything is in place, but - # the loop does not break. - self.assertEqual(test_equiv_fn([]), result.test_fn([])) - self.assertEqual(test_equiv_fn([1]), result.test_fn([1])) - self.assertEqual(test_equiv_fn([2]), result.test_fn([2])) - self.assertEqual( - test_equiv_fn([1, 2, 3, 4]), result.test_fn([1, 2, 3, 4])) + # The break is incompletely canonicalized. The loop will not interrupt, + # but the section following the break will be skipped. + self.assertEqual([], result.test_fn([])) + self.assertEqual([3, 3], result.test_fn([4, 4])) + self.assertEqual([3], result.test_fn([4, 5])) + self.assertEqual([3], result.test_fn([5, 4])) - def test_continue_deeply_nested(self): + def test_deeply_nested(self): def test_fn(x): v = [] @@ -93,7 +79,7 @@ class BreakCanonicalizationTest(converter_test_base.TestCase): u.append(x) else: w.append(x) - continue + break v.append(x) return v, u, w @@ -101,11 +87,60 @@ class BreakCanonicalizationTest(converter_test_base.TestCase): node = break_statements.transform(node, self.ctx) with self.compiled(node) as result: - self.assertEqual(test_fn(0), result.test_fn(0)) - self.assertEqual(test_fn(1), result.test_fn(1)) - self.assertEqual(test_fn(2), result.test_fn(2)) - self.assertEqual(test_fn(3), result.test_fn(3)) - self.assertEqual(test_fn(4), result.test_fn(4)) + self.assertEqual(([], [], []), result.test_fn(0)) + self.assertEqual(([2, 1], [2], [0]), result.test_fn(3)) + self.assertEqual(([10, 9, 8, 7], [10, 8], [6]), result.test_fn(11)) + + def test_nested_loops(self): + + def test_fn(x): + v = [] + u = [] + while x > 0: + x -= 1 + y = x + while y > 0: + y -= 1 + if y % 2 == 0: + break + u.append(y) + if x == 0: + break + v.append(x) + return v, u + + node = self.parse_and_analyze(test_fn, {}) + node = break_statements.transform(node, self.ctx) + + with self.compiled(node) as result: + self.assertEqual(([], []), result.test_fn(0)) + self.assertEqual(([1], []), result.test_fn(2)) + self.assertEqual(([2, 1], [1]), result.test_fn(3)) + self.assertEqual(([4, 3, 2, 1], [3, 1]), result.test_fn(5)) + + def test_loop_else(self): + + def test_fn(x): + v = [] + u = [] + while x > 0: + x -= 1 + y = x + while y > 1: + break + else: + u.append(y) + break + v.append(x) + return v, u + + node = self.parse_and_analyze(test_fn, {}) + node = break_statements.transform(node, self.ctx) + + with self.compiled(node) as result: + self.assertEqual(([], []), result.test_fn(0)) + self.assertEqual(([], [1]), result.test_fn(2)) + self.assertEqual(([2], [1]), result.test_fn(3)) if __name__ == '__main__': diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py index 685fd39d7cd7d0b15c9240032c9767392ec3642c..554f0471d44d54194c45c3855b1483796ae65a6a 100644 --- a/tensorflow/contrib/autograph/converters/call_trees.py +++ b/tensorflow/contrib/autograph/converters/call_trees.py @@ -245,8 +245,6 @@ class CallTreeTransformer(transformer.Base): new_call.keywords = node.keywords return new_call - # pylint:disable=invalid-name - def visit_Expr(self, node): if isinstance(node.value, gast.Call): if anno.hasanno(node.value.func, 'live_val'): @@ -294,15 +292,17 @@ class CallTreeTransformer(transformer.Base): raise NotImplementedError( 'py_func with return values (unknown function)') else: - if self.context.recursive: + if ast_util.matches(node, 'super(_)'): + # super() calls are preserved. The class conversion mechanism will + # ensure that they return the correct value. + pass + elif self.context.recursive: node = self._insert_dynamic_conversion(node) else: # Unresolved functions are allowed in non-recursive mode. pass return node - # pylint:enable=invalid-name - def transform(node, context, uncompiled_modules, nocompile_decorators): """Transform function call to the compiled counterparts. diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py index 23b61cf78155b376f5bd1760f9a45669c2589679..41c2e71702e7e3ee3811a2cbee27c8c988eb3a5c 100644 --- a/tensorflow/contrib/autograph/converters/converter_test_base.py +++ b/tensorflow/contrib/autograph/converters/converter_test_base.py @@ -35,14 +35,17 @@ from tensorflow.python.platform import test class FakeNamer(object): + """A fake namer that uses a global counter to generate unique names.""" + + def __init__(self): + self.i = 0 def new_symbol(self, name_root, used): - i = 0 while True: - name = '%s%d' % (name_root, i) + self.i += 1 + name = '%s%d' % (name_root, self.i) if name not in used: return name - i += 1 def compiled_function_name(self, original_fqn, diff --git a/tensorflow/contrib/autograph/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py index 6dda554acc6331830e4a086f2d51aefcdf2ecd91..b49521b2c328f418828a5e92890aa1b169384b70 100644 --- a/tensorflow/contrib/autograph/converters/lists.py +++ b/tensorflow/contrib/autograph/converters/lists.py @@ -82,23 +82,33 @@ class ListTransformer(transformer.Base): element=call_node.args[0]) return node + def _replace_list_constructors(self, targets, values): + for target in targets: + if (isinstance(target, (gast.Tuple, gast.List)) and + isinstance(values, (gast.Tuple, gast.List))): + n_targets = len(target.elts) + for i in range(n_targets): + target_el, value_el = target.elts[i], values.elts[i] + values.elts[i] = self._replace_list_constructors( + (target_el,), value_el) + return values + if isinstance(values, gast.List): + if values.elts: + return self._pre_populated_list(values) + else: + return self._empty_list(values) + return values + def visit_Assign(self, node): node = self.generic_visit(node) # Only convert lists when they are assigned to a variable, e.g.: # l = [] - # TODO(mdan): This rule should be improved. - if len(node.targets) != 1: - return node - if not isinstance(node.value, gast.List): - return node - if not isinstance(node.value.ctx, gast.Load): - return node - - if node.value.elts: - node.value = self._pre_populated_list(node.value) - else: - node.value = self._empty_list(node.value) + # TODO(mdan): A similar pattern exists in type_info.py + # We should add a generic "unpack_assignment" function to the base + # transformer, that has the same effect as applying some logic to the SSA + # form. + node.value = self._replace_list_constructors(node.targets, node.value) return node diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py index 749ba14347314f975c5a6e1111133336e2f5c5e6..74c6dc64f197f75eb3e66c01fb078467e8e8ea89 100644 --- a/tensorflow/contrib/autograph/converters/lists_test.py +++ b/tensorflow/contrib/autograph/converters/lists_test.py @@ -45,7 +45,51 @@ class ListTest(converter_test_base.TestCase): result.utils = utils result.dtypes = dtypes with self.test_session() as sess: - self.assertEqual(test_fn(), sess.run(result.test_fn().stack())) + self.assertAllEqual([1], sess.run(result.test_fn().stack())) + + def test_empty_annotated_lists_unpacked(self): + + def test_fn(): + l, m = [], [] + utils.set_element_type(l, dtypes.int32) + utils.set_element_type(m, dtypes.int32) + l.append(1) + m.append(2) + return l, m + + node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils}) + node = lists.transform(node, self.ctx) + + with self.compiled(node, tensor_array_ops.TensorArray, + dtypes.int32) as result: + result.utils = utils + result.dtypes = dtypes + with self.test_session() as sess: + res_l, res_m = result.test_fn() + self.assertEqual([1], sess.run(res_l.stack())) + self.assertEqual([2], sess.run(res_m.stack())) + + def test_empty_annotated_lists_list_unpacked(self): + + def test_fn(): + [l, m] = [], [] + utils.set_element_type(l, dtypes.int32) + utils.set_element_type(m, dtypes.int32) + l.append(1) + m.append(2) + return l, m + + node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils}) + node = lists.transform(node, self.ctx) + + with self.compiled(node, tensor_array_ops.TensorArray, + dtypes.int32) as result: + result.utils = utils + result.dtypes = dtypes + with self.test_session() as sess: + res_l, res_m = result.test_fn() + self.assertEqual([1], sess.run(res_l.stack())) + self.assertEqual([2], sess.run(res_m.stack())) if __name__ == '__main__': diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb similarity index 50% rename from tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb rename to tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb index 7f5e4d4ac124f3e9834a87193da110160926e77e..324b23c24b5a7970d7f20ed955839ba1cf1774fc 100644 --- a/tensorflow/contrib/autograph/examples/notebooks/rnn_colorbot_estimator.ipynb +++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb @@ -62,7 +62,7 @@ } }, "source": [ - "# Case study: building an RNN\n" + "# Case study: training a custom RNN, using Keras and Estimators\n" ] }, { @@ -118,6 +118,16 @@ " length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n", " return rgb, chars, length\n", "\n", + "\n", + "def set_static_batch_shape(batch_size):\n", + " def apply(rgb, chars, length):\n", + " rgb.set_shape((batch_size, None))\n", + " chars.set_shape((batch_size, None, 256))\n", + " length.set_shape((batch_size,))\n", + " return rgb, chars, length\n", + " return apply\n", + "\n", + "\n", "def load_dataset(data_dir, url, batch_size, training=True):\n", " \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n", " path = tf.keras.utils.get_file(os.path.basename(url), url, cache_dir=data_dir)\n", @@ -129,7 +139,10 @@ " if training:\n", " dataset = dataset.shuffle(buffer_size=3000)\n", " dataset = dataset.padded_batch(\n", - " batch_size, padded_shapes=([None], [None, None], []))\n", + " batch_size, padded_shapes=((None,), (None, 256), ()))\n", + " # To simplify the model code, we statically set as many of the shapes that we\n", + " # know.\n", + " dataset = dataset.map(set_static_batch_shape(batch_size))\n", " return dataset" ] }, @@ -145,7 +158,8 @@ "source": [ "To show the use of control flow, we write the RNN loop by hand, rather than using a pre-built RNN model.\n", "\n", - "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode." + "Note how we write the model code in Eager style, with regular `if` and `while` statements. Then, we annotate the functions with `@autograph.convert` to have them automatically compiled to run in graph mode.\n", + "We use Keras to define the model, and we will train it using Estimators." ] }, { @@ -166,70 +180,72 @@ }, "outputs": [], "source": [ - "class RnnColorbot(object):\n", - " \"\"\"Holds the parameters of the colorbot model.\"\"\"\n", + "@autograph.convert()\n", + "class RnnColorbot(tf.keras.Model):\n", + " \"\"\"RNN Colorbot model.\"\"\"\n", "\n", " def __init__(self):\n", + " super(RnnColorbot, self).__init__()\n", " self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n", " self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n", " self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n", "\n", + "\n", + " def _rnn_layer(self, chars, cell, batch_size, training):\n", + " \"\"\"A single RNN layer.\n", + "\n", + " Args:\n", + " chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n", + " cell: An object of type tf.contrib.rnn.LSTMBlockCell\n", + " batch_size: Int, the batch size to use\n", + " training: Boolean, whether the layer is used for training\n", + "\n", + " Returns:\n", + " A Tensor of shape (max_sequence_length, batch_size, output_size).\n", + " \"\"\"\n", + " hidden_outputs = []\n", + " autograph.utils.set_element_type(hidden_outputs, tf.float32)\n", + " state, output = cell.zero_state(batch_size, tf.float32)\n", + " for ch in chars:\n", + " cell_output, (state, output) = cell.call(ch, (state, output))\n", + " hidden_outputs.append(cell_output)\n", + " hidden_outputs = hidden_outputs.stack()\n", + " if training:\n", + " hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n", + " return hidden_outputs\n", + "\n", + " def build(self, _):\n", + " \"\"\"Creates the model variables. See keras.Model.build().\"\"\"\n", " self.lower_cell.build(tf.TensorShape((None, 256)))\n", " self.upper_cell.build(tf.TensorShape((None, 256)))\n", - " self.relu_layer.build(tf.TensorShape((None, 128)))\n", + " self.relu_layer.build(tf.TensorShape((None, 128))) \n", + " self.built = True\n", "\n", "\n", - "def rnn_layer(chars, cell, batch_size, training):\n", - " \"\"\"A simple RNN layer.\n", - " \n", - " Args:\n", - " chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n", - " cell: An object of type tf.contrib.rnn.LSTMBlockCell\n", - " batch_size: Int, the batch size to use\n", - " training: Boolean, whether the layer is used for training\n", + " def call(self, inputs, training=False):\n", + " \"\"\"The RNN model code. Uses Eager and \n", "\n", - " Returns:\n", - " A Tensor of shape (max_sequence_length, batch_size, output_size).\n", - " \"\"\"\n", - " hidden_outputs = []\n", - " autograph.utils.set_element_type(hidden_outputs, tf.float32)\n", - " state, output = cell.zero_state(batch_size, tf.float32)\n", - " for ch in chars:\n", - " cell_output, (state, output) = cell.call(ch, (state, output))\n", - " hidden_outputs.append(cell_output)\n", - " hidden_outputs = hidden_outputs.stack()\n", - " if training:\n", - " hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n", - " return hidden_outputs\n", + " The model consists of two RNN layers (made by lower_cell and upper_cell),\n", + " followed by a fully connected layer with ReLU activation.\n", "\n", + " Args:\n", + " inputs: A tuple (chars, length)\n", + " training: Boolean, whether the layer is used for training\n", "\n", - "@autograph.convert(recursive=True)\n", - "def model(inputs, colorbot, batch_size, training):\n", - " \"\"\"RNNColorbot model.\n", - " \n", - " The model consists of two RNN layers (made by lower_cell and upper_cell),\n", - " followed by a fully connected layer with ReLU activation.\n", - " \n", - " Args:\n", - " inputs: A tuple (chars, length)\n", - " colorbot: An object of type RnnColorbot\n", - " batch_size: Int, the batch size to use\n", - " training: Boolean, whether the layer is used for training\n", - " \n", - " Returns:\n", - " A Tensor of shape (batch_size, 3) - the model predictions.\n", - " \"\"\"\n", - " (chars, length) = inputs\n", - " seq = tf.transpose(chars, [1, 0, 2])\n", - " seq.set_shape((None, batch_size, 256))\n", + " Returns:\n", + " A Tensor of shape (batch_size, 3) - the model predictions.\n", + " \"\"\"\n", + " chars, length = inputs\n", + " batch_size = chars.shape[0]\n", + " seq = tf.transpose(chars, (1, 0, 2))\n", "\n", - " seq = rnn_layer(seq, colorbot.lower_cell, batch_size, training)\n", - " seq = rnn_layer(seq, colorbot.upper_cell, batch_size, training)\n", + " seq = self._rnn_layer(seq, self.lower_cell, batch_size, training)\n", + " seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n", "\n", - " # Grab just the end-of-sequence from each output.\n", - " indices = tf.stack([length - 1, range(batch_size)], axis=1)\n", - " sequence_ends = tf.gather_nd(seq, indices)\n", - " return colorbot.relu_layer(sequence_ends)\n", + " # Grab just the end-of-sequence from each output.\n", + " indices = tf.stack([length - 1, range(batch_size)], axis=1)\n", + " sequence_ends = tf.gather_nd(seq, indices)\n", + " return self.relu_layer(sequence_ends)\n", "\n", "@autograph.convert()\n", "def loss_fn(labels, predictions):\n", @@ -246,9 +262,9 @@ } }, "source": [ - "We will now create the model function for the estimator.\n", + "We will now create the model function for the custom Estimator.\n", "\n", - "In the model function, we simply call the converted functions that we defined above - that's it!" + "In the model function, we simply use the model class we defined above - that's it!" ] }, { @@ -275,14 +291,12 @@ " sequence_length = features['sequence_length']\n", " inputs = (chars, sequence_length)\n", "\n", - " # Create the model components.\n", - " # Simply calling the AutoGraph-ed functions and objects just works!\n", + " # Create the model. Simply using the AutoGraph-ed class just works!\n", " colorbot = RnnColorbot()\n", - " \n", - " batch_size = params['batch_size']\n", + " colorbot.build(None)\n", "\n", " if mode == tf.estimator.ModeKeys.TRAIN:\n", - " predictions = model(inputs, colorbot, batch_size, training=True)\n", + " predictions = colorbot(inputs, training=True)\n", " loss = loss_fn(labels, predictions)\n", "\n", " learning_rate = params['learning_rate']\n", @@ -292,14 +306,13 @@ " return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)\n", "\n", " elif mode == tf.estimator.ModeKeys.EVAL:\n", - " predictions = model(inputs, colorbot, batch_size, training=False)\n", + " predictions = colorbot(inputs)\n", " loss = loss_fn(labels, predictions)\n", "\n", " return tf.estimator.EstimatorSpec(mode, loss=loss)\n", - " \n", + "\n", " elif mode == tf.estimator.ModeKeys.PREDICT:\n", - " # For prediction, we expect single tensors.\n", - " predictions = model(inputs, colorbot, 1, training=False)\n", + " predictions = colorbot(inputs)\n", "\n", " predictions = tf.minimum(predictions, 1.0)\n", " return tf.estimator.EstimatorSpec(mode, predictions=predictions)" @@ -368,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 7, "metadata": { "colab": { "autoexec": { @@ -379,9 +392,9 @@ }, "colab_type": "code", "executionInfo": { - "elapsed": 10064, + "elapsed": 10604, "status": "ok", - "timestamp": 1523580419240, + "timestamp": 1524095272039, "user": { "displayName": "", "photoUrl": "", @@ -390,7 +403,7 @@ "user_tz": 240 }, "id": "2pg1AfbxBJQq", - "outputId": "41894b16-3d3a-4e30-f6e4-5a9c837a2210", + "outputId": "9c924b4f-06e1-4538-976c-a3e1ddac5660", "slideshow": { "slide_type": "-" } @@ -400,7 +413,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Eval loss at step 100: 0.0665446\n" + "Eval loss at step 100: 0.0674834\n" ] } ], @@ -444,7 +457,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": 8, "metadata": { "colab": { "autoexec": { @@ -455,9 +468,9 @@ }, "colab_type": "code", "executionInfo": { - "elapsed": 31286, + "elapsed": 7990, "status": "ok", - "timestamp": 1523580450579, + "timestamp": 1524095280105, "user": { "displayName": "", "photoUrl": "", @@ -466,7 +479,7 @@ "user_tz": 240 }, "id": "dxHex2tUN_10", - "outputId": "b3dc558d-b800-4e9b-e60e-3441124e80d8", + "outputId": "2b889e5a-b9ed-4645-bf03-d98f26c72101", "slideshow": { "slide_type": "slide" } @@ -478,7 +491,7 @@ "\u003clink rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'\u003e\u003c/link\u003e" ], "text/plain": [ - "\u003cIPython.core.display.HTML at 0x7f4112527e90\u003e" + "\u003cIPython.core.display.HTML at 0x7f3f36aa6cd0\u003e" ] }, "metadata": { @@ -494,7 +507,7 @@ "\u003cscript src='/nbextensions/google.colab/tabbar_main.min.js'\u003e\u003c/script\u003e" ], "text/plain": [ - "\u003cIPython.core.display.HTML at 0x7f4112527f10\u003e" + "\u003cIPython.core.display.HTML at 0x7f3eca67f7d0\u003e" ] }, "metadata": { @@ -510,7 +523,7 @@ "\u003cdiv id=\"id1\"\u003e\u003c/div\u003e" ], "text/plain": [ - "\u003cIPython.core.display.HTML at 0x7f4112527f50\u003e" + "\u003cIPython.core.display.HTML at 0x7f3eca67f8d0\u003e" ] }, "metadata": { @@ -523,11 +536,11 @@ { "data": { "application/javascript": [ - "window[\"2c60f474-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"borderColor\": [\"#a7a7a7\"], \"contentBorder\": [\"0px\"], \"tabNames\": [\"RNN Colorbot\"], \"elementId\": \"id1\"});\n", - "//# sourceURL=js_a0db480422" + "window[\"e8ddfa22-4362-11e8-91ec-c8d3ffb5fbe0\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"elementId\": \"id1\", \"borderColor\": [\"#a7a7a7\"], \"contentHeight\": [\"initial\"], \"tabNames\": [\"RNN Colorbot\"], \"location\": \"top\", \"initialSelection\": 0});\n", + "//# sourceURL=js_71b9087b6d" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd1d0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67f950\u003e" ] }, "metadata": { @@ -540,11 +553,11 @@ { "data": { "application/javascript": [ - "window[\"2c60f475-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_d2a46ea291" + "window[\"e8ddfa23-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", + "//# sourceURL=js_e390445f33" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd0d0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e" ] }, "metadata": { @@ -557,11 +570,11 @@ { "data": { "application/javascript": [ - "window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_0a8262c6e9" + "window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", + "//# sourceURL=js_241dd76d85" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd390\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e" ] }, "metadata": { @@ -575,11 +588,11 @@ { "data": { "application/javascript": [ - "window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_e32f85ccd2" + "window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", + "//# sourceURL=js_60c64e3d50" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e" ] }, "metadata": { @@ -593,11 +606,11 @@ { "data": { "application/javascript": [ - "window[\"2c60f478-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f477-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_eaee748b21" + "window[\"e8ddfa26-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa25-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n", + "//# sourceURL=js_14ea437cbd" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd550\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e" ] }, "metadata": { @@ -611,11 +624,11 @@ { "data": { "application/javascript": [ - "window[\"2c60f479-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_2befe06587" + "window[\"e8ddfa27-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", + "//# sourceURL=js_09294c2226" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f4112527f10\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fcd0\u003e" ] }, "metadata": { @@ -629,11 +642,11 @@ { "data": { "application/javascript": [ - "window[\"354d7b1a-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"2c60f476-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_8ec4aeeb25" + "window[\"ec965514-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"e8ddfa24-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n", + "//# sourceURL=js_e5e8266997" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd690\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fe10\u003e" ] }, "metadata": { @@ -647,11 +660,11 @@ { "data": { "application/javascript": [ - "window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_9f9f4574f1" + "window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", + "//# sourceURL=js_07a097f0ee" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd350\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fc90\u003e" ] }, "metadata": { @@ -665,11 +678,11 @@ { "data": { "application/javascript": [ - "window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_bcccd8f300" + "window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", + "//# sourceURL=js_790d669ca8" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd6d0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67f8d0\u003e" ] }, "metadata": { @@ -683,11 +696,11 @@ { "data": { "application/javascript": [ - "window[\"354d7b1d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1c-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_2c056cee72" + "window[\"ec965517-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965516-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n", + "//# sourceURL=js_d30df771f0" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fd90\u003e" ] }, "metadata": { @@ -701,11 +714,11 @@ { "data": { "application/javascript": [ - "window[\"354d7b1e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_c853c3f58b" + "window[\"ec965518-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", + "//# sourceURL=js_8a43a2da4b" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd610\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fc50\u003e" ] }, "metadata": { @@ -718,369 +731,9 @@ }, { "data": { - "application/javascript": [ - "window[\"354d7b1f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b1b-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_e5730ab00d" - ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMBJREFUeJzt3F+I1XX+x/G32zjiFERUpgaFd2JBzOg5joX4h0SiMgmM\n/uhVGIlgFBlERGB3hUEkhkRdtDfRP1ACL6KpLBqcguxCjEAkmGamQcSohFHzsxe7O6zssvsydtff\n+ns8rs758j3f8z7fiyef7/k3o7XWCiDwh4s9APC/QzCAmGAAMcEAYoIBxAQDiAkGF8XTTz9d3W63\n7rvvvhoZGakVK1Zc7JEICMYlbvXq1TU8PHyxxzjPV199VcPDw/XZZ5/V22+/XVVVM2bMuMhTkRAM\n/qt+++23+uGHH+r666+vWbNmXexxuECCcQl76qmnanx8vLZs2VIDAwP1+uuv1zfffFP3339/dTqd\nWr9+fY2MjEzvv2nTpnr55ZfrgQceqIGBgXr44Yfr5MmTVVV1+vTp2r59ey1durQ6nU5t2LChTpw4\nUVVVk5OTtWXLllq6dGmtXbu23nnnnelj7tq1q7Zt21bbt2+vJUuW1HvvvVfPPvtsHTp0qAYGBmrX\nrl1/N/fRo0dr06ZN1el06u67766hoaGqqhodHa1OpzO93zPPPFO33nrr9P3t27fXm2+++e89iZyv\ncUlbtWpVGx4ebq21NjEx0brdbjtw4EBrrbUvvviidbvdduLEidZaaxs3bmxr1qxp33//fZuammob\nN25sO3fubK219tZbb7VHH320TU1NtXPnzrXDhw+3X375pbXW2kMPPdR27NjRTp8+3Y4cOdIGBwen\nn/OVV15pN910U/voo49aa61NTU21999/vz344IPTMx48eLCtWLGitdbamTNn2po1a9qePXvamTNn\n2vDwcOvv72/Hjh2bfj2HDx9urbW2du3advvtt7ejR4+21lpbuXJlO3LkyH/qVNJas8L4f6D95edC\n+/btq5UrV9by5curqmrZsmV1880316effjq977333ls33HBD9fb21h133FFHjhypqqqenp46efJk\nHTt2rGbMmFGLFi2qyy+/vCYmJurrr7+uJ598smbOnFkLFy6sDRs21N69e6eP2d/fX6tXr66qqt7e\n3n8666FDh+rUqVP1yCOPVE9PTw0ODtaqVavqgw8+qKqqJUuW1MjISB0/fryqqtauXVtffvlljY6O\n1q+//loLFy78N501/pGeiz0A/z1jY2O1f//++vjjj6vqzyE5e/ZsLVu2bHqfa665Zvr27Nmz69Sp\nU1VVdc8999TExEQ98cQT9fPPP9e6devq8ccfr8nJybryyitr9uzZ04+bP39+HT58ePr+3Llz4xkn\nJydr3rx5522bP39+TU5OVlVVp9OpoaGhuu6666rb7Va32629e/dWb29vLV68+ALOBr+HYFzi/vbT\nh3nz5tX69etrx44dF3ycnp6e2rp1a23durXGxsZq8+bNtWDBgrrtttvqp59+qlOnTlVfX19VVY2P\nj9ecOXP+4Qz/ypw5c2p8fPy8bWNjY7VgwYKqqup2u/Xiiy/WvHnzqtPp1MDAQD333HPV29tb3W73\ngl8XF8YlySXu2muvrdHR0aqqWrduXQ0NDdXnn39e586dq6mpqRoZGakff/zxXx7n4MGD9d1339W5\nc+eqr6+venp66rLLLqu5c+dWf39/vfTSS3X69On69ttv6913361169b9rnlvueWW6uvrq9dee63O\nnj1bBw8erE8++aTuvPPOqqq68cYba9asWbVv377qdDp1xRVX1NVXX10ffvjheW+I8p8hGJe4zZs3\n1+7du6vb7db+/ftr9+7dtWfPnlq2bFmtWrWq3njjjen3OP7ZSuD48eO1bdu2Wrx4cd111121dOnS\n6Sjs3LmzRkdHa/ny5bVt27Z67LHHzrvMuRAzZ86sV199tQ4cOFCDg4P1/PPP1wsvvDC9wqj68yrj\nqquumr7U+WsoFi1a9Luek9yM1vyBDpCxwgBiggHEBAOICQYQ+z/7PYzjf/QRGVxM12z68u+2WWEA\nMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE\nBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhAT\nDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMIDajtdYu9hDA/wYrDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4j9CY2LTAbbRbWuAAAAAElFTkSuQmCC\n", "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2050\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_a897ef7e24" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2250\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_565fa3d154" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f4113124d90\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"354d7b22-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b21-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_222e0dc6af" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f4113124c10\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"354d7b23-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_831db7458f" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f4113124310\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fab4-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"354d7b20-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_adb576c6eb" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_9418f2d32f" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f990850\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_3fad25f306" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f4112527ed0\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fab7-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab6-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_45b9340e7b" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f990c90\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fab8-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_bec9896d44" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f990a10\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fab9-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fab5-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_460b91ad4a" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3a10\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_7dedd0b037" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3890\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_4b1c977dc7" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3bd0\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fabc-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803fabb-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_d64fedfcf9" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3410\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3803fabd-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_3e8c929c3f" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3c50\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3b9b986c-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3803faba-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_9f9cf2b76f" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_b402e6b587" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3d90\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_9b7d66db72" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3b10\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3b9b986f-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986e-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_11ec213a3f" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3950\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "window[\"3b9b9870-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_9c055e4bc0" - ], - "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41b21d3850\u003e" - ] - }, - "metadata": { - "tags": [ - "id1_content_0", - "outputarea_id1" - ] - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAENCAYAAAD60Fs2AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAACMRJREFUeJzt3F+IlfW+x/Gvp3FECyIqU4PCO7EgZnQtnUJ0JJGoTDoY\n/dGrMBJhosggIgK7KwwiMdxRF11F/0AJvIisLBqcguxCjEAkmNQGcRvVwIzm71zsc4Yje7P3x9h7\nz97u1+tqrYdnPeu7nos3v2f9m9FaawUQ+K/pHgD49yEYQEwwgJhgADHBAGKCAcQEg2nx9NNPV7fb\nrfvuu69GRkZq5cqV0z0SAcG4xK1evbqGh4ene4wLfPXVVzU8PFyfffZZvf3221VVNWPGjGmeioRg\n8E/122+/1Q8//FDXX399zZo1a7rH4SIJxiXsqaeeqhMnTtSWLVuqv7+/Xn/99frmm2/q/vvvr06n\nU+vXr6+RkZGp/Tdt2lQvv/xyPfDAA9Xf318PP/xwnTlzpqqqJicna9u2bbVs2bLqdDq1YcOGOn36\ndFVVjY2N1ZYtW2rZsmW1du3aeuedd6aOuXPnzhoaGqpt27bV0qVL67333qtnn322Dh06VP39/bVz\n584/m/vo0aO1adOm6nQ6dffdd9f+/furqmp0dLQ6nc7Ufs8880zdeuutU/e3bdtWb7755t/3JHKh\nxiVtcHCwDQ8Pt9ZaO3nyZOt2u+3AgQOttda++OKL1u122+nTp1trrW3cuLGtWbOmff/9921iYqJt\n3Lix7dixo7XW2ltvvdUeffTRNjEx0c6fP98OHz7cfvnll9Zaaw899FDbvn17m5ycbEeOHGnLly+f\nes5XXnml3XTTTe2jjz5qrbU2MTHR3n///fbggw9OzXjw4MG2cuXK1lprZ8+ebWvWrGm7d+9uZ8+e\nbcPDw62vr68dO3Zs6vUcPny4tdba2rVr2+23396OHj3aWmtt1apV7ciRI/+oU0lrzQrjP0D7358L\n7d27t1atWlUrVqyoqqqBgYG6+eab69NPP53a9957760bbrihent764477qgjR45UVVVPT0+dOXOm\njh07VjNmzKjFixfX5ZdfXidPnqyvv/66nnzyyZo5c2YtWrSoNmzYUHv27Jk6Zl9fX61evbqqqnp7\ne//qrIcOHarx8fF65JFHqqenp5YvX16Dg4P1wQcfVFXV0qVLa2RkpE6dOlVVVWvXrq0vv/yyRkdH\n69dff61Fixb9nc4af0nPdA/AP8/x48dr37599fHHH1fVn0Jy7ty5GhgYmNrnmmuumbo9e/bsGh8f\nr6qqe+65p06ePFlPPPFE/fzzz7Vu3bp6/PHHa2xsrK688sqaPXv21OMWLFhQhw8fnro/b968eMax\nsbGaP3/+BdsWLFhQY2NjVVXV6XRq//79dd1111W3261ut1t79uyp3t7eWrJkyUWcDX4PwbjE/f9P\nH+bPn1/r16+v7du3X/Rxenp6auvWrbV169Y6fvx4bd68uRYuXFi33XZb/fTTTzU+Pl5z5sypqqoT\nJ07U3Llz/+IMf8vcuXPrxIkTF2w7fvx4LVy4sKqqut1uvfjiizV//vzqdDrV399fzz33XPX29la3\n273o18XFcUlyibv22mtrdHS0qqrWrVtX+/fvr88//7zOnz9fExMTNTIyUj/++OPfPM7Bgwfru+++\nq/Pnz9ecOXOqp6enLrvsspo3b1719fXVSy+9VJOTk/Xtt9/Wu+++W+vWrftd895yyy01Z86ceu21\n1+rcuXN18ODB+uSTT+rOO++sqqobb7yxZs2aVXv37q1Op1NXXHFFXX311fXhhx9e8IYo/xiCcYnb\nvHlz7dq1q7rdbu3bt6927dpVu3fvroGBgRocHKw33nhj6j2Ov7YSOHXqVA0NDdWSJUvqrrvuqmXL\nlk1FYceOHTU6OlorVqyooaGheuyxxy64zLkYM2fOrFdffbUOHDhQy5cvr+eff75eeOGFqRVG1Z9W\nGVddddXUpc7/hWLx4sW/6znJzWjNH+gAGSsMICYYQEwwgJhgALF/2e9h/PEP/z3dI8B/tKseee/P\ntllhADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEA\nYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOI\nCQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAm\nGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhg\nADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIB\nxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQ\nEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBM\nMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHB\nAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQD\niAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHEBAOICQYQEwwg\nJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICY\nYAAxwQBiggHEBAOICQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEwwgJhgADHBAGKC\nAcQEA4gJBhATDCA2o7XWpnsI4N+DFQYQEwwgJhhATDCAmGAAMcEAYoIBxAQDiAkGEBMMICYYQEww\ngJhgADHBAGKCAcQEA4gJBhATDCAmGEBMMICYYAAxwQBiggHE/gfh60wGjfc7LQAAAABJRU5ErkJg\ngg==\n", - "text/plain": [ - "\u003cmatplotlib.figure.Figure at 0x7f4113124310\u003e" + "\u003cmatplotlib.figure.Figure at 0x7f3ecc00bf10\u003e" ] }, "metadata": { @@ -1095,11 +748,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9871-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b986d-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_ba6a061307" + "window[\"ec965519-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec965515-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n", + "//# sourceURL=js_893ad561f4" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd890\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b55c90\u003e" ] }, "metadata": { @@ -1113,11 +766,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", - "//# sourceURL=js_83e3496927" + "window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.getActiveOutputArea();\n", + "//# sourceURL=js_2d99e0ac17" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd590\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67fe50\u003e" ] }, "metadata": { @@ -1131,11 +784,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", - "//# sourceURL=js_f437bab20d" + "window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"] = document.querySelector(\"#id1_content_0\");\n", + "//# sourceURL=js_5c19462e32" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a22d0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b55dd0\u003e" ] }, "metadata": { @@ -1149,11 +802,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9874-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9873-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_93aa63450e" + "window[\"ec96551c-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551b-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n", + "//# sourceURL=js_b9c8b7567b" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2b90\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b55a50\u003e" ] }, "metadata": { @@ -1167,11 +820,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9875-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", - "//# sourceURL=js_aca189bea5" + "window[\"ec96551d-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"id1\"].setSelectedTabIndex(0);\n", + "//# sourceURL=js_fd05186348" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd4d0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b55810\u003e" ] }, "metadata": { @@ -1185,10 +838,10 @@ { "data": { "text/html": [ - "\u003cdiv class=id_100313201 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e" + "\u003cdiv class=id_888646481 style=\"margin-right:10px; display:flex;align-items:center;\"\u003e\u003cspan style=\"margin-right: 3px;\"\u003e\u003c/span\u003e\u003c/div\u003e" ], "text/plain": [ - "\u003cIPython.core.display.HTML at 0x7f410f990a90\u003e" + "\u003cIPython.core.display.HTML at 0x7f3f32414810\u003e" ] }, "metadata": { @@ -1203,11 +856,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n", - "//# sourceURL=js_5df1fe383e" + "window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n", + "//# sourceURL=js_efef96e882" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f410f8fd490\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e" ] }, "metadata": { @@ -1222,11 +875,11 @@ { "data": { "application/javascript": [ - "window[\"3b9b9877-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3b9b9876-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n", - "//# sourceURL=js_c62c7174ad" + "window[\"ec96551f-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ec96551e-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n", + "//# sourceURL=js_6eca889864" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2390\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3eca67f990\u003e" ] }, "metadata": { @@ -1241,11 +894,11 @@ { "data": { "application/javascript": [ - "window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 input\");\n", - "//# sourceURL=js_2e2201ddc4" + "window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 input\");\n", + "//# sourceURL=js_f02070cc60" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2810\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b553d0\u003e" ] }, "metadata": { @@ -1260,11 +913,11 @@ { "data": { "application/javascript": [ - "window[\"3ed76585-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76584-3eb4-11e8-91ec-c8d3ffb5fbe0\"].remove();\n", - "//# sourceURL=js_288e5283d6" + "window[\"ed8ea973-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea972-4362-11e8-91ec-c8d3ffb5fbe0\"].remove();\n", + "//# sourceURL=js_ed9faba660" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a26d0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31a95450\u003e" ] }, "metadata": { @@ -1279,11 +932,11 @@ { "data": { "application/javascript": [ - "window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_100313201 span\");\n", - "//# sourceURL=js_2f31d19cde" + "window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"] = jQuery(\".id_888646481 span\");\n", + "//# sourceURL=js_f3458d7074" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31a95250\u003e" ] }, "metadata": { @@ -1298,11 +951,11 @@ { "data": { "application/javascript": [ - "window[\"3ed76587-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = window[\"3ed76586-3eb4-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n", - "//# sourceURL=js_2fbbcda050" + "window[\"ed8ea975-4362-11e8-91ec-c8d3ffb5fbe0\"] = window[\"ed8ea974-4362-11e8-91ec-c8d3ffb5fbe0\"].text(\"Give me a color name (or press 'enter' to exit): \");\n", + "//# sourceURL=js_3ffd97bd6f" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f4112527e90\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31a953d0\u003e" ] }, "metadata": { @@ -1317,11 +970,11 @@ { "data": { "application/javascript": [ - "window[\"3ed76588-3eb4-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"3b9b9872-3eb4-11e8-91ec-c8d3ffb5fbe0\"]);\n", - "//# sourceURL=js_f94d975cf3" + "window[\"ed8ea976-4362-11e8-91ec-c8d3ffb5fbe0\"] = google.colab.output.setActiveOutputArea(window[\"ec96551a-4362-11e8-91ec-c8d3ffb5fbe0\"]);\n", + "//# sourceURL=js_7f73e8bcca" ], "text/plain": [ - "\u003cIPython.core.display.Javascript at 0x7f41127a2fd0\u003e" + "\u003cIPython.core.display.Javascript at 0x7f3f31b55710\u003e" ] }, "metadata": { @@ -1337,7 +990,7 @@ "def predict_input_fn(color_name):\n", " \"\"\"An input function for prediction.\"\"\"\n", " _, chars, sequence_length = parse(color_name)\n", - " \n", + "\n", " # We create a batch of a single element.\n", " features = {\n", " 'chars': tf.expand_dims(chars, 0),\n", @@ -1385,7 +1038,11 @@ "colab": { "collapsed_sections": [], "default_view": {}, - "name": "RNN Colorbot using Estimators", + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "name": "RNN Colorbot using Keras and Estimators", "provenance": [ { "file_id": "1CtzefX39ffFibX_BqE6cRbT0UW_DdVKl", diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py index 3c3130c77025c45ca219daf4bb66082f4e8a7f82..24f87b2c14da4a3523f1e580d4362cbd3679a2cd 100644 --- a/tensorflow/contrib/autograph/impl/api.py +++ b/tensorflow/contrib/autograph/impl/api.py @@ -241,7 +241,7 @@ def to_graph(e, module = gast.Module([]) for import_line in config.COMPILED_IMPORT_STATEMENTS: module.body.extend(parser.parse_str(import_line).body) - for dep in conversion_map.dependency_cache.values(): + for dep in reversed(conversion_map.dependency_cache.values()): module.body.append(dep) compiled_node, compiled_src = compiler.ast_to_object(module) diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py index c9fb972953ea315006b9f3005a5d3df5a2f043d3..55a30dc127957b2a9caa053db843380c94bacfbf 100644 --- a/tensorflow/contrib/autograph/impl/conversion.py +++ b/tensorflow/contrib/autograph/impl/conversion.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import imp import gast @@ -39,6 +40,7 @@ from tensorflow.contrib.autograph.converters import side_effect_guards from tensorflow.contrib.autograph.converters import single_return from tensorflow.contrib.autograph.impl import config from tensorflow.contrib.autograph.impl import naming +from tensorflow.contrib.autograph.pyct import ast_util from tensorflow.contrib.autograph.pyct import context from tensorflow.contrib.autograph.pyct import inspect_utils from tensorflow.contrib.autograph.pyct import parser @@ -81,7 +83,9 @@ class ConversionMap(object): self.recursive = recursive self.nocompile_decorators = nocompile_decorators self.partial_types = partial_types if partial_types else () - self.dependency_cache = {} + # Required to output dependencies in discovery order, which should match + # the reverse dependency order. + self.dependency_cache = collections.OrderedDict() self.additional_imports = set() self.name_map = {} self.api_module = api_module @@ -154,7 +158,16 @@ def entity_to_graph(o, conversion_map, arg_values, arg_types): if tf_inspect.isclass(o): node, name, ns = class_to_graph(o, conversion_map) elif tf_inspect.isfunction(o): - node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types) + # TODO(mdan): This is not a reliable mechanism. + # The most reliable way is to check the source code, the AST will contain + # a Lambda node instead of a FunctionDef + if o.__name__ == '': + raise NotImplementedError( + 'lambda functions are not yet supported; declare the function' + ' using def instead: %s' % o) + else: + node, name, ns = function_to_graph(o, conversion_map, arg_values, + arg_types) elif tf_inspect.ismethod(o): node, name, ns = function_to_graph(o, conversion_map, arg_values, arg_types) else: @@ -192,6 +205,9 @@ def class_to_graph(c, conversion_map): class_namespace = {} for _, m in members: + # Only convert the members that are directly defined by the class. + if inspect_utils.getdefiningclass(m, c) is not c: + continue node, _, namespace = function_to_graph( m, conversion_map=conversion_map, @@ -205,12 +221,49 @@ def class_to_graph(c, conversion_map): converted_members[m] = node namer = conversion_map.new_namer(class_namespace) class_name = namer.compiled_class_name(c.__name__, c) - node = gast.ClassDef( - class_name, - bases=[], - keywords=[], - body=list(converted_members.values()), - decorator_list=[]) + + # TODO(mdan): This needs to be explained more thoroughly. + # Process any base classes: if the sueprclass if of a whitelisted type, an + # absolute import line is generated. Otherwise, it is marked for conversion + # (as a side effect of the call to namer.compiled_class_name() followed by + # conversion_map.update_name_map(namer)). + output_nodes = [] + renames = {} + bases = [] + for base in c.__bases__: + if isinstance(object, base): + bases.append('object') + continue + if is_whitelisted_for_graph(base): + alias = namer.new_symbol(base.__name__, ()) + output_nodes.append( + gast.ImportFrom( + module=base.__module__, + names=[gast.alias(name=base.__name__, asname=alias)], + level=0)) + else: + # This will trigger a conversion into a class with this name. + alias = namer.compiled_class_name(base.__name__, base) + bases.append(alias) + renames[qual_names.QN(base.__name__)] = qual_names.QN(alias) + conversion_map.update_name_map(namer) + + # Generate the definition of the converted class. + output_nodes.append( + gast.ClassDef( + class_name, + bases=bases, + keywords=[], + body=list(converted_members.values()), + decorator_list=[])) + node = gast.Module(output_nodes) + + # Make a final pass to replace references to the class or its base classes. + # Most commonly, this occurs when making super().__init__() calls. + # TODO(mdan): Making direct references to superclass' superclass will fail. + node = qual_names.resolve(node) + renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name) + node = ast_util.rename_symbols(node, renames) return node, class_name, class_namespace @@ -222,16 +275,22 @@ def _add_reserved_symbol(namespace, name, entity): raise ValueError('The name "%s" is reserved and may not be used.' % name) +ag_internal = None + + def _add_self_references(namespace, api_module): - # Craft a module that exposes parts of the external API as well as certain - # internal modules. - ag_internal = imp.new_module('autograph') - ag_internal.converted_call = api_module.converted_call - ag_internal.utils = utils - # TODO(mdan): Add safeguards against name clashes. - # We don't want to create a submodule because we want the operators to be - # accessible as ag__. - ag_internal.__dict__.update(operators.__dict__) + """Adds namespace references to the module that exposes the api itself.""" + global ag_internal + if ag_internal is None: + # Craft a module that exposes parts of the external API as well as certain + # internal modules. + ag_internal = imp.new_module('autograph') + ag_internal.converted_call = api_module.converted_call + ag_internal.utils = utils + # TODO(mdan): Add safeguards against name clashes. + # We don't want to create a submodule because we want the operators to be + # accessible as ag__. + ag_internal.__dict__.update(operators.__dict__) _add_reserved_symbol(namespace, 'ag__', ag_internal) diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py index f0b597c12fd3dfb503a68e46d0e041c0e67d65ad..5edd8e74a8899a25fb51e2a4e133f3cb7933fa26 100644 --- a/tensorflow/contrib/autograph/impl/conversion_test.py +++ b/tensorflow/contrib/autograph/impl/conversion_test.py @@ -24,6 +24,7 @@ from tensorflow.contrib.autograph import utils from tensorflow.contrib.autograph.impl import api from tensorflow.contrib.autograph.impl import conversion from tensorflow.python.framework import constant_op +from tensorflow.python.keras._impl.keras.engine import training from tensorflow.python.platform import test @@ -78,6 +79,87 @@ class ConversionTest(test.TestCase): conversion_map.dependency_cache[f].body[0].body[0].value.func.id) self.assertEqual('tf__g', conversion_map.dependency_cache[g].name) + def test_entity_to_graph_class_hierarchy(self): + + class TestBase(object): + + def __init__(self, x='base'): + self.x = x + + def foo(self): + return self.x + + def bar(self): + return self.x + + class TestSubclass(TestBase): + + def __init__(self, y): + super(TestSubclass, self).__init__('sub') + self.y = y + + def foo(self): + return self.y + + def baz(self): + return self.y + + conversion_map = self._simple_conversion_map() + conversion.entity_to_graph(TestSubclass, conversion_map, None, None) + + self.assertTrue(TestBase in conversion_map.dependency_cache) + self.assertTrue(TestSubclass in conversion_map.dependency_cache) + self.assertEqual('TfTestBase', + conversion_map.dependency_cache[TestBase].body[-1].name) + self.assertEqual( + 'TfTestSubclass', + conversion_map.dependency_cache[TestSubclass].body[-1].name) + + def test_entity_to_graph_class_hierarchy_whitelisted(self): + + class TestSubclass(training.Model): + + def __init__(self, y): + super(TestSubclass, self).__init__() + self.built = False + + def call(self, x): + return 3 * x + + conversion_map = self._simple_conversion_map() + conversion.entity_to_graph(TestSubclass, conversion_map, None, None) + + self.assertTrue(TestSubclass in conversion_map.dependency_cache) + self.assertFalse(training.Model in conversion_map.dependency_cache) + self.assertEqual( + 'Model', + conversion_map.dependency_cache[TestSubclass].body[0].names[0].name) + self.assertEqual( + 'TfTestSubclass', + conversion_map.dependency_cache[TestSubclass].body[-1].name) + + def test_entity_to_graph_lambda(self): + f = lambda a: a + + with self.assertRaises(NotImplementedError): + conversion_map = self._simple_conversion_map() + conversion.entity_to_graph(f, conversion_map, None, None) + + def test_ag_module_cached(self): + def callee(): + return range(3) + + def caller(a): + return a() + + conversion_map = self._simple_conversion_map() + _, _, callee_ns = conversion.entity_to_graph( + callee, conversion_map, None, None) + _, _, caller_ns = conversion.entity_to_graph( + caller, conversion_map, None, None) + + self.assertTrue(callee_ns['ag__'] is caller_ns['ag__']) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/autograph/impl/naming.py b/tensorflow/contrib/autograph/impl/naming.py index 1facaa0ca0ebcc6d4281e7c92a462ceeb00b453a..b1d3f76be7763fada88fd0a1da9d3aa43b67ddfa 100644 --- a/tensorflow/contrib/autograph/impl/naming.py +++ b/tensorflow/contrib/autograph/impl/naming.py @@ -62,8 +62,6 @@ class Namer(object): n += 1 new_name = '%s_%d' % (new_name_root, n) - if live_entity is not None: - self.renamed_calls[live_entity] = new_name self.generated_names.add(new_name) if live_entity is not None: self.renamed_calls[live_entity] = new_name diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py index 4a70bab4402a940dec6a8b183daf7406a7e34131..c4f82d11708393a6029d3f17be428b47eb9342ff 100644 --- a/tensorflow/contrib/autograph/pyct/ast_util.py +++ b/tensorflow/contrib/autograph/pyct/ast_util.py @@ -23,10 +23,11 @@ import ast import gast from tensorflow.contrib.autograph.pyct import anno +from tensorflow.contrib.autograph.pyct import parser class CleanCopier(gast.NodeVisitor): - """Copy AST nodes. + """Copies AST nodes. The copied nodes will ignore almost all fields that are prefixed by '__'. Exceptions make some annotations. @@ -106,3 +107,79 @@ def keywords_to_dict(keywords): keys.append(gast.Str(kw.arg)) values.append(kw.value) return gast.Dict(keys=keys, values=values) + + +class PatternMatcher(gast.NodeVisitor): + """Matches a node against a pattern represented by a node. + + The pattern may contain wildcards represented by the symbol '_'. + """ + + def __init__(self, pattern): + self.pattern = pattern + self.pattern_stack = [] + self.matches = True + + def compare_and_visit(self, node, pattern): + self.pattern_stack.append(self.pattern) + self.pattern = pattern + self.generic_visit(node) + self.pattern = self.pattern_stack.pop() + + def no_match(self): + self.matches = False + return False + + def is_wildcard(self, p): + if isinstance(p, (list, tuple)) and len(p) == 1: + p, = p + if isinstance(p, gast.Name) and p.id == '_': + return True + if p == '_': + return True + return False + + def generic_visit(self, node): + if not self.matches: + return + + pattern = self.pattern + for f in node._fields: + if f.startswith('__'): + continue + + if not hasattr(node, f): + if hasattr(pattern, f) and getattr(pattern, f): + return self.no_match() + else: + continue + if not hasattr(pattern, f): + return self.no_match() + + v = getattr(node, f) + p = getattr(pattern, f) + + if self.is_wildcard(p): + continue + if isinstance(v, (list, tuple)): + if not isinstance(p, (list, tuple)) or len(v) != len(p): + return self.no_match() + for v_item, p_item in zip(v, p): + self.compare_and_visit(v_item, p_item) + elif isinstance(v, (gast.AST, ast.AST)): + if not isinstance(v, type(p)) and not isinstance(p, type(v)): + return self.no_match() + self.compare_and_visit(v, p) + else: + # Assume everything else is a value type. + if v != p: + return self.no_match() + + +def matches(node, pattern): + if isinstance(pattern, str): + pattern = parser.parse_expression(pattern) + matcher = PatternMatcher(pattern) + matcher.visit(node) + return matcher.matches + diff --git a/tensorflow/contrib/autograph/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py index 8faf92c705d997db298dbb1115981fd9da26372d..3afa04a50685d19c90944c14ed39f9d3ad35e486 100644 --- a/tensorflow/contrib/autograph/pyct/ast_util_test.py +++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py @@ -85,7 +85,33 @@ class AstUtilTest(test.TestCase): output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),) result, _ = compiler.ast_to_object(output) self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'}) - print(d) + + def assertMatch(self, target_str, pattern_str): + node = parser.parse_expression(target_str) + pattern = parser.parse_expression(pattern_str) + self.assertTrue(ast_util.matches(node, pattern)) + + def assertNoMatch(self, target_str, pattern_str): + node = parser.parse_expression(target_str) + pattern = parser.parse_expression(pattern_str) + self.assertFalse(ast_util.matches(node, pattern)) + + def test_matches_symbols(self): + self.assertMatch('foo', '_') + self.assertNoMatch('foo()', '_') + self.assertMatch('foo + bar', 'foo + _') + self.assertNoMatch('bar + bar', 'foo + _') + self.assertNoMatch('foo - bar', 'foo + _') + + def test_matches_function_args(self): + self.assertMatch('super(Foo, self).__init__(arg1, arg2)', + 'super(_).__init__(_)') + self.assertMatch('super().__init__()', 'super(_).__init__(_)') + self.assertNoMatch('super(Foo, self).bar(arg1, arg2)', + 'super(_).__init__(_)') + self.assertMatch('super(Foo, self).__init__()', 'super(Foo, _).__init__(_)') + self.assertNoMatch('super(Foo, self).__init__()', + 'super(Bar, _).__init__(_)') if __name__ == '__main__': diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py index 63361cc4f2557d22800072d90a51b7e4ddab34ab..eef74599a7d5415b4b05d2f05fb094b1dcd33323 100644 --- a/tensorflow/contrib/autograph/pyct/inspect_utils.py +++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py @@ -63,16 +63,27 @@ def getnamespace(f): return namespace +def _get_unbound_function(m): + # TODO(mdan): Figure out why six.get_unbound_function fails in some cases. + # The failure case is for tf.keras.Model. + if hasattr(m, 'im_func'): + return m.im_func + return m + + def getdefiningclass(m, owner_class): """Resolves the class (e.g. one of the superclasses) that defined a method.""" - m = six.get_unbound_function(m) - last_defining = owner_class - for superclass in tf_inspect.getmro(owner_class): + # Normalize bound functions to their respective unbound versions. + m = _get_unbound_function(m) + for superclass in owner_class.__bases__: if hasattr(superclass, m.__name__): superclass_m = getattr(superclass, m.__name__) - if six.get_unbound_function(superclass_m) == m: - last_defining = superclass - return last_defining + if _get_unbound_function(superclass_m) is m: + return superclass + elif hasattr(m, '__self__') and m.__self__ == owner_class: + # Python 3 class methods only work this way it seems :S + return superclass + return owner_class def getmethodclass(m): diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py index cf841dae814f64583bc43a2e110f1dcf5c0d7c1f..1a212f676a616307b41feafafda9d1d794ba3d2d 100644 --- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py +++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py @@ -243,6 +243,10 @@ class InspectUtilsTest(test.TestCase): def bar(self): pass + @classmethod + def class_method(cls): + pass + class Subclass(Superclass): def foo(self): @@ -257,6 +261,9 @@ class InspectUtilsTest(test.TestCase): inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass) self.assertTrue( inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass) + self.assertTrue( + inspect_utils.getdefiningclass(Subclass.class_method, Subclass) is + Superclass) def test_isbuiltin(self): self.assertTrue(inspect_utils.isbuiltin(range)) diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py index 2f553e1e23dfb7367fc9b6123222e685d0490780..c00946f9c41bc68d5c638d71f356b484db1286d1 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py @@ -184,7 +184,7 @@ class TypeInfoResolver(transformer.Base): # Multiple targets mean multiple assignment. for target in targets: # Tuple target means unpacking. - if isinstance(target, gast.Tuple): + if isinstance(target, (gast.Tuple, gast.List)): for i, target_item in enumerate(target.elts): # Two cases here: # 1. Static unpacking, e.g. a, b = c, d @@ -199,8 +199,7 @@ class TypeInfoResolver(transformer.Base): target_symbol = anno.getanno(target, anno.Basic.QN) self.scope.setval(target_symbol, source) else: - raise ValueError( - 'assignment target has unknown type: %s' % target_item) + raise ValueError('assignment target has unknown type: %s' % target) def visit_With(self, node): for wi in node.items: diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py index e102ab763011e744863e95e858638b6bea689d18..4db6cc0adfad90ffc1a6bbcadfc80215688d271e 100644 --- a/tensorflow/contrib/autograph/pyct/transformer.py +++ b/tensorflow/contrib/autograph/pyct/transformer.py @@ -69,6 +69,10 @@ class Base(gast.NodeTransformer): def enclosing_entities(self): return tuple(self._enclosing_entities) + @property + def locel_scope_level(self): + return len(self._local_scope_state) + def enter_local_scope(self): self._local_scope_state.append({}) diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index 7df514cd207c5e781f3b4abaa2020016b197669d..9d6cc9245aa463d0c8cfc7ad209736357b6c0323 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -417,9 +417,18 @@ class SparseSplitHandler(InequalitySplitHandler): return (are_splits_ready, partition_ids, gains, split_infos) -@function.Defun(dtypes.bool, dtypes.bool, dtypes.float32, dtypes.float32, - dtypes.int32, dtypes.float32, dtypes.float32, dtypes.float32, - dtypes.float32, dtypes.float32) +@function.Defun( + dtypes.bool, + dtypes.bool, + dtypes.float32, + dtypes.float32, + dtypes.int32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + noinline=True) def dense_make_stats_update(is_active, are_buckets_ready, float_column, quantile_buckets, example_partition_ids, gradients, hessians, weights, empty_gradients, empty_hessians): @@ -452,9 +461,20 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column, gradients, hessians) -@function.Defun(dtypes.bool, dtypes.bool, dtypes.int64, dtypes.float32, - dtypes.int64, dtypes.float32, dtypes.int32, dtypes.float32, - dtypes.float32, dtypes.float32, dtypes.float32, dtypes.float32) +@function.Defun( + dtypes.bool, + dtypes.bool, + dtypes.int64, + dtypes.float32, + dtypes.int64, + dtypes.float32, + dtypes.int32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + noinline=True) def sparse_make_stats_update( is_active, are_buckets_ready, sparse_column_indices, sparse_column_values, sparse_column_shape, quantile_buckets, example_partition_ids, gradients, diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc index 4481c0d0e4400acd93c9a277de185db7aaf9bcb0..67ac9bf387ae9b3ca29e610c2c4138c28302ca33 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream_test.cc @@ -138,6 +138,12 @@ void GenerateOneValue(int32 worker_id, int64 max_elements, double *total_weight, stream->Finalize(); } +void GenerateOneZeroWeightedValue(int32 worker_id, int64 max_elements, + double *total_weight, Stream *stream) { + stream->PushEntry(10, 0); + stream->Finalize(); +} + TEST(WeightedQuantilesStreamTest, OneValue) { const double eps = 0.01; const int64 max_elements = 1 << 16; @@ -145,6 +151,13 @@ TEST(WeightedQuantilesStreamTest, OneValue) { {10.0, 10.0, 10.0, 10.0, 10.0}, 1e-2); } +TEST(WeightedQuantilesStreamTest, OneZeroWeightValue) { + const double eps = 0.01; + const int64 max_elements = 1 << 16; + TestSingleWorkerStreams(eps, max_elements, GenerateOneZeroWeightedValue, {}, + 1e-2); +} + TEST(WeightedQuantilesStreamTest, FixedUniform) { const double eps = 0.01; const int64 max_elements = 1 << 16; diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h index aec232f3cbb096f0aa51e4362a821882391f8027..7576856dc3a6d0b6681ee9745c875cf46d1e2960 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h @@ -235,6 +235,11 @@ class WeightedQuantilesSummary { // The resulting boundaries are guaranteed to both contain at least // num_boundaries unique elements and maintain approximation bounds. std::vector GenerateBoundaries(int64 num_boundaries) const { + std::vector output; + if (entries_.empty()) { + return output; + } + // Generate soft compressed summary. WeightedQuantilesSummary compressed_summary; @@ -246,7 +251,6 @@ class WeightedQuantilesSummary { compressed_summary.Compress(num_boundaries, compression_eps); // Return boundaries. - std::vector output; output.reserve(compressed_summary.entries_.size()); for (const auto& entry : compressed_summary.entries_) { output.push_back(entry.value); @@ -260,6 +264,9 @@ class WeightedQuantilesSummary { // full rank queries O(nlogn). std::vector GenerateQuantiles(int64 num_quantiles) const { std::vector output; + if (entries_.empty()) { + return output; + } num_quantiles = std::max(num_quantiles, 2LL); output.reserve(num_quantiles + 1); diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 4bde7f3e33d6f8b295cd35cb32bbbccecf8a2b87..08c1dcdd028829e6ef290965347d184ed42f416d 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -970,10 +970,8 @@ class GradientBoostedDecisionTreeModel(object): # Stack all the inputs to one tensor per type. # This is a workaround for the slowness of graph building in tf.cond. # See (b/36554864). - split_sizes = array_ops.stack([ - array_ops.shape(partition_id)[0] - for partition_id in partition_ids_list - ]) + split_sizes = array_ops.reshape( + array_ops.shape_n(partition_ids_list), [-1]) partition_ids = array_ops.concat(partition_ids_list, axis=0) gains = array_ops.concat(gains_list, axis=0) split_infos = array_ops.concat(split_info_list, axis=0) diff --git a/tensorflow/contrib/checkpoint/README.md b/tensorflow/contrib/checkpoint/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d35c5bae3b702c0fea5194e5e653660e319e38c5 --- /dev/null +++ b/tensorflow/contrib/checkpoint/README.md @@ -0,0 +1,2 @@ +Tools for working with object-based checkpoints produced by +`tf.train.Checkpoint`. diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1192cc44a17823f69db28947308a8b839a83e57e --- /dev/null +++ b/tensorflow/contrib/checkpoint/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tools for working with object-based checkpoints. + + +For creating and managing dependencies: +@@dot_graph_from_checkpoint +@@split_dependency +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency +from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint + +from tensorflow.python.util.all_util import remove_undocumented + +remove_undocumented(module_name=__name__) diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..a5681ffa61d07ef29d0a0862db9736a210c8e26e --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/BUILD @@ -0,0 +1,61 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + +py_library( + name = "checkpoint", + srcs_version = "PY2AND3", + deps = [ + ":split_dependency", + ":visualize", + ], +) + +py_library( + name = "split_dependency", + srcs = ["split_dependency.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:training", + ], +) + +py_test( + name = "split_dependency_test", + srcs = ["split_dependency_test.py"], + deps = [ + ":split_dependency", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:training", + "//tensorflow/python/eager:test", + ], +) + +py_library( + name = "visualize", + srcs = ["visualize.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:pywrap_tensorflow", + ], +) + +py_test( + name = "visualize_test", + srcs = ["visualize_test.py"], + deps = [ + ":visualize", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:training", + "//tensorflow/python/eager:test", + ], +) diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/checkpoint/python/split_dependency.py similarity index 95% rename from tensorflow/contrib/eager/python/checkpointable_utils.py rename to tensorflow/contrib/checkpoint/python/split_dependency.py index 30c4103c5aa52a74bcc8f72c7e1df186c9f7f591..3aec8c96e90440d6da00d95cffc34bd53ec7164f 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils.py +++ b/tensorflow/contrib/checkpoint/python/split_dependency.py @@ -1,4 +1,4 @@ -"""Utilities for working with Checkpointable objects.""" +"""Utility for creating multiple dependencies with synchronized save/restore.""" # Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,7 +20,7 @@ from __future__ import print_function import functools from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import checkpointable as core_checkpointable +from tensorflow.python.training import checkpointable as checkpointable from tensorflow.python.training import saver as saver_lib @@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject): return self._restore_callback(tensor) -class _SplitDependency(core_checkpointable.CheckpointableBase): +class _SplitDependency(checkpointable.CheckpointableBase): """Looks like a regular variable while synchronizing save/restores.""" def __init__(self, save_buffer, restore_buffer, name, dtype, num_components, @@ -83,7 +83,7 @@ class _SplitDependency(core_checkpointable.CheckpointableBase): def _gather_saveables_for_checkpoint(self): """Looks to Checkpointable like a regular variable.""" return { - core_checkpointable.VARIABLE_VALUE_KEY: + checkpointable.VARIABLE_VALUE_KEY: functools.partial(_CallbackSaveable, dtype=self._dtype, save_callback=self._save, diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py similarity index 96% rename from tensorflow/contrib/eager/python/checkpointable_utils_test.py rename to tensorflow/contrib/checkpoint/python/split_dependency_test.py index bd42d405db9d1275c83636dc83090fa11b0b74b1..f1d9d19b047ee69281cf8bdba38a28dc87947e38 100644 --- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py +++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import os -from tensorflow.contrib.eager.python import checkpointable_utils as contrib_checkpointable_utils +from tensorflow.contrib.checkpoint.python import split_dependency from tensorflow.python.eager import test from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -47,7 +47,7 @@ class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase): def __init__(self): self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.]) - split_dependencies = contrib_checkpointable_utils.split_dependency( + split_dependencies = split_dependency.split_dependency( component_names=("first_half", "second_half"), component_dtypes=(self.combined.dtype,) * 2, fill_save_buffer_fn=_split_variable_closure( diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..86fbdb41d2c37803f2bd71b5aa2f72845c87d448 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/visualize.py @@ -0,0 +1,111 @@ +"""Utilities for visualizing dependency graphs.""" +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.core.protobuf import checkpointable_object_graph_pb2 +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.framework import errors_impl +from tensorflow.python.training import checkpointable + + +def dot_graph_from_checkpoint(save_path): + r"""Visualizes an object-based checkpoint (from `tf.train.Checkpoint`). + + Useful for inspecting checkpoints and debugging loading issues. + + Example usage from Python (requires pydot): + ```python + import tensorflow as tf + import pydot + + dot_string = tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt') + parsed, = pydot.graph_from_dot_data(dot_string) + parsed.write_svg('/tmp/tensorflow/visualized_checkpoint.svg') + ``` + + Example command line usage: + ```sh + python -c "import tensorflow as tf;\ + print(tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt'))"\ + | dot -Tsvg > /tmp/tensorflow/checkpoint_viz.svg + ``` + + Args: + save_path: The checkpoint prefix, as returned by `tf.train.Checkpoint.save` + or `tf.train.latest_checkpoint`. + Returns: + A graph in DOT format as a string. + """ + reader = pywrap_tensorflow.NewCheckpointReader(save_path) + try: + object_graph_string = reader.get_tensor( + checkpointable.OBJECT_GRAPH_PROTO_KEY) + except errors_impl.NotFoundError: + raise ValueError( + ('The specified checkpoint "%s" does not appear to be object-based (it ' + 'is missing the key "%s"). Likely it was created with a name-based ' + 'saver and does not contain an object dependency graph.') % ( + save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY)) + shape_map = reader.get_variable_to_shape_map() + dtype_map = reader.get_variable_to_dtype_map() + object_graph = ( + checkpointable_object_graph_pb2.CheckpointableObjectGraph()) + object_graph.ParseFromString(object_graph_string) + graph = 'digraph {\n' + def _escape(name): + return name.replace('"', '\\"') + slot_ids = set() + for node in object_graph.nodes: + for slot_reference in node.slot_variables: + slot_ids.add(slot_reference.slot_variable_node_id) + for node_id, node in enumerate(object_graph.nodes): + if (len(node.attributes) == 1 + and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY): + if node_id in slot_ids: + color = 'orange' + tooltip_prefix = 'Slot variable' + else: + color = 'blue' + tooltip_prefix = 'Variable' + attribute = node.attributes[0] + graph += ('N_%d [shape=point label="" color=%s width=.25' + ' tooltip="%s %s shape=%s %s"]\n') % ( + node_id, + color, + tooltip_prefix, + _escape(attribute.full_name), + shape_map[attribute.checkpoint_key], + dtype_map[attribute.checkpoint_key].name) + elif node.slot_variables: + graph += ('N_%d [shape=point label="" width=.25 color=red,' + 'tooltip="Optimizer"]\n') % node_id + else: + graph += 'N_%d [shape=point label="" width=.25]\n' % node_id + for reference in node.children: + graph += 'N_%d -> N_%d [label="%s"]\n' % ( + node_id, reference.node_id, _escape(reference.local_name)) + for slot_reference in node.slot_variables: + graph += 'N_%d -> N_%d [label="%s" style=dotted]\n' % ( + node_id, + slot_reference.slot_variable_node_id, + _escape(slot_reference.slot_name)) + graph += 'N_%d -> N_%d [style=dotted]\n' % ( + slot_reference.original_variable_node_id, + slot_reference.slot_variable_node_id) + graph += '}\n' + return graph diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d9ab789235cb964521315b4864563f89745ae75 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/visualize_test.py @@ -0,0 +1,97 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +from tensorflow.contrib.checkpoint.python import visualize + +from tensorflow.python.eager import context +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.keras._impl.keras.engine import training +from tensorflow.python.keras._impl.keras.layers import core +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.training import adam +from tensorflow.python.training import checkpointable_utils + +try: + import pydot # pylint: disable=g-import-not-at-top +except ImportError: + pydot = None + + +class MyModel(training.Model): + """A concrete Model for testing.""" + + def __init__(self): + super(MyModel, self).__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret + + +class DotGraphTests(test.TestCase): + + def testMakeDotGraph(self): + with context.eager_mode(): + input_value = constant_op.constant([[3.]]) + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + optimizer_step = resource_variable_ops.ResourceVariable(12) + save_checkpoint = checkpointable_utils.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step) + optimizer.minimize(functools.partial(model, input_value)) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') + save_path = save_checkpoint.save(checkpoint_prefix) + prefix = save_checkpoint.save(save_path) + + dot_graph_string = visualize.dot_graph_from_checkpoint(prefix) + + # The remainder of this test is more-or-less optional since it's so + # dependent on pydot/platform/Python versions. + if pydot is None: + self.skipTest('pydot is required for the remainder of this test.') + try: + parsed, = pydot.graph_from_dot_data(dot_graph_string) + except NameError as e: + if "name 'dot_parser' is not defined" in str(e): + self.skipTest("pydot isn't working") + else: + raise + # Check that the graph isn't completely trivial + self.assertEqual( + '"model"', + parsed.obj_dict['edges'][('N_0', 'N_1')][0]['attributes']['label']) + image_path = os.path.join(self.get_temp_dir(), 'saved.svg') + try: + parsed.write_svg(image_path) + except Exception as e: # pylint: disable=broad-except + # For some reason PyDot's "dot not available" error is an Exception, not + # something more specific. + if '"dot" not found in path' in str(e): + self.skipTest("pydot won't save SVGs (dot not available)") + else: + raise + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 5a2771229d9ffe2b5b389d1077fe02a230e9a4c0..1403483d287041b02dfbf538f7e7ddee11662f47 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -245,7 +245,7 @@ class TPUClusterResolver(ClusterResolver): else: if not self._tpu.startswith(compat.as_bytes('grpc://')): # Case 3. - return server_lib.ClusterSpec({}) + return None # Case 2. cluster_spec = {self._job_name: [self._tpu[len( compat.as_bytes('grpc://')):]]} diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index dff7a03b6847fb6e159dc2fa9832fceb3dfe2d54..5b3f9be5a11237f9dceebefa1db294efaf7e482d 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -356,8 +356,7 @@ class TPUClusterResolverTest(test.TestCase): tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar') self.assertEqual( compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master()) - self.assertEqual( - server_lib.ClusterSpec({}), tpu_cluster_resolver.cluster_spec()) + self.assertEqual(None, tpu_cluster_resolver.cluster_spec()) def testGkeEnvironment(self): os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470' diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 10f29deca08f2a70eeeb7c758f3268e0bce11e8c..d75b1b12a62e31346f4990b7497b734001e285ae 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -214,6 +214,7 @@ include(protobuf) include(re2) include(cub) include(sqlite) +include(double_conversion) if (tensorflow_BUILD_CC_TESTS) include(googletest) endif() @@ -234,6 +235,7 @@ set(tensorflow_EXTERNAL_LIBRARIES ${protobuf_STATIC_LIBRARIES} ${re2_STATIC_LIBRARIES} ${sqlite_STATIC_LIBRARIES} + ${double_conversion_STATIC_LIBRARIES} ) if (systemlib_ZLIB) @@ -261,6 +263,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES fft2d re2 sqlite_copy_headers_to_destination + double_conversion ) include_directories( @@ -283,6 +286,7 @@ include_directories( ${PROTOBUF_INCLUDE_DIRS} ${re2_INCLUDE_DIR} ${sqlite_INCLUDE_DIR} + ${double_conversion_INCLUDE_DIR} ) if(tensorflow_ENABLE_SSL_SUPPORT) @@ -467,6 +471,10 @@ if (tensorflow_ENABLE_GPU) include_directories(${tensorflow_source_dir}/third_party/gpus) # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES}) + if(NOT WIN32) + # add gomp to tensorflow_EXTERNAL_LIBRARIES, needed by libcusolver.so + list(APPEND tensorflow_EXTERNAL_LIBRARIES gomp) + endif() # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used # in the default build is upgraded. diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake new file mode 100644 index 0000000000000000000000000000000000000000..527ccdc8d887cb4c2e7d2412c99a8bc682568472 --- /dev/null +++ b/tensorflow/contrib/cmake/external/double_conversion.cmake @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +include (ExternalProject) + +set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion) +set(double_conversion_URL https://github.com/google/double-conversion.git) +set(double_conversion_TAG 5664746) +set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR}) +set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so) +set(double_conversion_INCLUDES ${double_conversion_BUILD}) + +if(WIN32) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib) +else() + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a) +endif() + +set(double_conversion_HEADERS + "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h" +) + +ExternalProject_Add(double_conversion + PREFIX double_conversion + GIT_REPOSITORY ${double_conversion_URL} + GIT_TAG ${double_conversion_TAG} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "" + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON +) diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 91839194c7c214fe910ff78723ab418f86c7fac0..6468bed4979253be5c20666d26bf24fa479d64a0 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -129,7 +129,13 @@ tensorflow/contrib/boosted_trees/kernels tensorflow/contrib/boosted_trees/ops tensorflow/contrib/boosted_trees/proto tensorflow/contrib/boosted_trees/python +tensorflow/contrib/boosted_trees/python/kernel_tests tensorflow/contrib/boosted_trees/python/ops +tensorflow/contrib/boosted_trees/python/training +tensorflow/contrib/boosted_trees/python/training/functions +tensorflow/contrib/boosted_trees/python/utils +tensorflow/contrib/checkpoint +tensorflow/contrib/checkpoint/python tensorflow/contrib/cloud tensorflow/contrib/cloud/kernels tensorflow/contrib/cloud/ops @@ -142,8 +148,11 @@ tensorflow/contrib/coder tensorflow/contrib/coder/kernels tensorflow/contrib/coder/ops tensorflow/contrib/coder/python +tensorflow/contrib/coder/python/layers tensorflow/contrib/coder/python/ops tensorflow/contrib/compiler +tensorflow/contrib/constrained_optimization +tensorflow/contrib/constrained_optimization/python tensorflow/contrib/copy_graph tensorflow/contrib/copy_graph/python tensorflow/contrib/copy_graph/python/util diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index ed018b4fed8e47632f632723f19cc755f2079f86..f38c9e05135f9f8d2fb3e2efedb7223e06e4983a 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -63,6 +63,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc" @@ -176,6 +177,16 @@ if(WIN32) "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc" ) list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs}) +else(WIN32) + if(tensorflow_ENABLE_GPU) + file(GLOB_RECURSE tf_core_kernels_gpu_exclude_srcs + # temporarily disable nccl as it needs to be ported with gpu + "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc" + "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc" + ) + list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_gpu_exclude_srcs}) + endif(tensorflow_ENABLE_GPU) endif(WIN32) file(GLOB_RECURSE tf_core_gpu_kernels_srcs diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake index af48ef1fd40456162fee8b1e2c3ca45ecdb58830..9a37b681194d4ef82b27a0160dd969f733ecad67 100644 --- a/tensorflow/contrib/cmake/tf_stream_executor.cmake +++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake @@ -64,6 +64,8 @@ file(GLOB tf_stream_executor_srcs if (tensorflow_ENABLE_GPU) file(GLOB tf_stream_executor_gpu_srcs "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc" + "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.h" + "${tensorflow_source_dir}/tensorflow/compiler/xla/statusor.cc" ) if (NOT tensorflow_BUILD_CC_TESTS) file(GLOB tf_stream_executor_gpu_tests diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD index 9ca4ce8a9c765677865f77ea4982ad8613ce334c..a2c6e413039ee3b5af3cb53d1af3325037536d36 100644 --- a/tensorflow/contrib/coder/BUILD +++ b/tensorflow/contrib/coder/BUILD @@ -1,5 +1,5 @@ # Description: -# Contains entropy coding related modules. +# Contains tools related to data compression. package(default_visibility = [ "//learning/brain:__subpackages__", @@ -54,19 +54,27 @@ tf_gen_op_libs( ], ) +cc_library( + name = "range_coder_ops_util", + srcs = ["kernels/range_coder_ops_util.cc"], + hdrs = ["kernels/range_coder_ops_util.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], +) + tf_kernel_library( name = "range_coder_ops", srcs = [ "kernels/range_coder_ops.cc", - "kernels/range_coder_ops_util.cc", - ], - hdrs = [ - "kernels/range_coder_ops_util.h", ], visibility = ["//visibility:public"], deps = [ ":coder_ops_op_lib", ":range_coder", + ":range_coder_ops_util", "//tensorflow/core:framework", "//tensorflow/core:lib", ], @@ -152,10 +160,21 @@ tf_gen_op_wrapper_py( deps = [":coder_ops_op_lib"], ) +py_library( + name = "coder_py", + srcs = [ + "__init__.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":coder_ops_py", + ":entropybottleneck_py", + ], +) + tf_custom_op_py_library( name = "coder_ops_py", srcs = [ - "__init__.py", "python/ops/coder_ops.py", ], dso = [ @@ -186,3 +205,44 @@ tf_py_test( ], main = "python/ops/coder_ops_test.py", ) + +py_library( + name = "entropybottleneck_py", + srcs = [ + "python/layers/entropybottleneck.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":coder_ops_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:functional_ops", + "//tensorflow/python:init_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn", + "//tensorflow/python:ops", + "//tensorflow/python:random_ops", + "//tensorflow/python:state_ops", + "//tensorflow/python:summary_ops", + "//tensorflow/python:tensor_shape", + "//tensorflow/python:variable_scope", + "//tensorflow/python/eager:context", + "//tensorflow/python/keras:engine", + "//third_party/py/numpy", + ], +) + +tf_py_test( + name = "entropybottleneck_py_test", + srcs = [ + "python/layers/entropybottleneck_test.py", + ], + additional_deps = [ + ":entropybottleneck_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:variables", + "//tensorflow/python:training", + ], + main = "python/layers/entropybottleneck_test.py", +) diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py index b7e663e6f1359f399cdaa80e037635a8f7546b37..99b8ac7595ec632b2918e6b7ca22c06dd7f0a8b3 100644 --- a/tensorflow/contrib/coder/__init__.py +++ b/tensorflow/contrib/coder/__init__.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Entropy code operations.""" +"""Data compression tools.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import +from tensorflow.contrib.coder.python.layers.entropybottleneck import * from tensorflow.contrib.coder.python.ops.coder_ops import * # pylint: enable=wildcard-import diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc index c787e8edede0942cd152eafa6333849d194e58b6..bd5272ee6f20ac3537a2e378225ede5ee90782c5 100644 --- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc +++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc @@ -16,6 +16,7 @@ limitations under the License. #define EIGEN_USE_THREADS #include +#include #include #include #include @@ -79,8 +80,8 @@ class PmfToCdfOp : public OpKernel { } private: - struct Item { - Item(int32* p, double mass) : pointer(p), mass(mass) { + struct PenaltyItem { + PenaltyItem(int32* p, double mass) : pointer(p), mass(mass) { penalty = ComputeNextPenalty(); } @@ -90,7 +91,7 @@ class PmfToCdfOp : public OpKernel { penalty = ComputeNextPenalty(); } - friend bool operator<(const Item& lhs, const Item& rhs) { + friend bool operator<(const PenaltyItem& lhs, const PenaltyItem& rhs) { return lhs.penalty < rhs.penalty; } @@ -106,6 +107,34 @@ class PmfToCdfOp : public OpKernel { double penalty; }; + struct GainItem { + GainItem(int32* p, double mass) : pointer(p), mass(mass) { + gain = ComputeNextGain(); + } + + void Increase() { + CHECK_GT(*pointer, 0); + ++*pointer; + gain = ComputeNextGain(); + } + + friend bool operator>(const GainItem& lhs, const GainItem& rhs) { + return lhs.gain > rhs.gain; + } + + double ComputeNextGain() { + // Never increment zero value to non-zero value. + if (*pointer < 1) { + return -std::numeric_limits::infinity(); + } + return mass * (std::log2(*pointer + 1) - std::log2(*pointer)); + } + + int32* pointer; + double mass; + double gain; + }; + void PerShard(gtl::ArraySlice pmf, gtl::MutableArraySlice cdf) const { CHECK_EQ(pmf.size(), cdf.size()); @@ -121,7 +150,7 @@ class PmfToCdfOp : public OpKernel { int32 sum = std::accumulate(cdf.begin(), cdf.end(), 0); if (sum > normalizer) { - std::vector queue; + std::vector queue; queue.reserve(cdf.size()); for (int i = 0; i < cdf.size(); ++i) { queue.emplace_back(&cdf[i], pmf[i]); @@ -132,9 +161,26 @@ class PmfToCdfOp : public OpKernel { queue[0].Decrease(); // Performs a linear search because this find_if is likely to return // iterator very close to the begin. - auto iter = - std::find_if(std::next(queue.begin()), queue.end(), - [&queue](const Item& rhs) { return queue[0] < rhs; }); + auto iter = std::find_if( + std::next(queue.begin()), queue.end(), + [&queue](const PenaltyItem& rhs) { return queue[0] < rhs; }); + std::rotate(queue.begin(), std::next(queue.begin()), iter); + } + } else if (sum < normalizer) { + std::vector queue; + queue.reserve(cdf.size()); + for (int i = 0; i < cdf.size(); ++i) { + queue.emplace_back(&cdf[i], pmf[i]); + } + + std::sort(queue.begin(), queue.end(), std::greater()); + while (sum++ < normalizer) { + queue[0].Increase(); + // Performs a linear search because this find_if is likely to return + // iterator very close to the begin. + auto iter = std::find_if( + std::next(queue.begin()), queue.end(), + [&queue](const GainItem& rhs) { return queue[0] > rhs; }); std::rotate(queue.begin(), std::next(queue.begin()), iter); } } diff --git a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc index c70e38faab713e23b5defa890d35bfadeac5940a..3408f6b519a33fbb8f23d19c16bc7138fc34c121 100644 --- a/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc +++ b/tensorflow/contrib/coder/kernels/pmf_to_cdf_op_test.cc @@ -82,7 +82,7 @@ class PmfToQuantizedCdfOpTest : public OpsTestBase { EXPECT_GT(diff, 0); } - EXPECT_LE(cdf_slice(cdf_slice.size() - 1), normalizer); + EXPECT_EQ(cdf_slice(cdf_slice.size() - 1), normalizer); } } }; @@ -98,6 +98,8 @@ TEST_F(PmfToQuantizedCdfOpTest, UnderSum) { GenerateData(&rand, {&matrix(i, 0), n}); } + pmf.flat() = pmf.flat() * 0.85f; + constexpr int kPrecision = 10; SetupOp(kPrecision, &pmf); TF_ASSERT_OK(RunOpKernel()); @@ -115,7 +117,7 @@ TEST_F(PmfToQuantizedCdfOpTest, OverSum) { matrix.setZero(); const std::size_t n = matrix.dimension(1) / 2; - random::PhiloxRandom gen; + random::PhiloxRandom gen(random::New64(), random::New64()); random::SimplePhilox rand(&gen); for (int64 i = 0; i < matrix.dimension(0); ++i) { GenerateData(&rand, {&matrix(i, 0), n}); diff --git a/tensorflow/contrib/coder/ops/coder_ops.cc b/tensorflow/contrib/coder/ops/coder_ops.cc index 9bb171298f85088fdb776302776f2ba379b4f52e..a185e07913f84a813d76a8c63741bd22a832c8b9 100644 --- a/tensorflow/contrib/coder/ops/coder_ops.cc +++ b/tensorflow/contrib/coder/ops/coder_ops.cc @@ -77,7 +77,7 @@ are incorrect. For this reason, the range coder uses integer arithmetics and avoids using any floating point operations internally, and `cdf` should contain integers representing quantized probability mass rather than floating points. -data: An int32 tensor. +data: An int16 tensor. cdf: An int32 tensor representing the CDF's of `data`. Each integer is divided by `2^precision` to represent a fraction. encoded: A range-coded scalar string. @@ -112,7 +112,7 @@ potential performance issues, the decoder does not return error status. encoded: A scalar string tensor from RangeEncode. shape: An int32 1-D tensor representing the shape of the data encoded by RangeEncode. -decoded: An int32 tensor with shape equal to `shape`. +decoded: An int16 tensor with shape equal to `shape`. precision: The number of bits for probability quantization. Must be <= 16, and must match the precision used by RangeEncode that produced `encoded`. )doc"); @@ -138,14 +138,12 @@ platforms. For entropy encoders and decoders to have the same quantized CDF on different platforms, the quantized CDF should be produced once and saved, then the saved quantized CDF should be used everywhere. -After quantization, if PMF sums to less than or equal to 2^precision, then this -is equivalent to cumsum over the last dimension. This op makes no effort to make -the sum close to 2^precision when the sum is already <= 2^precision. +After quantization, if PMF does not sum to 2^precision, then some values of PMF +are increased or decreased to adjust the sum to equal to 2^precision. -After quantization, if PMF sums to greater than 2^precision, then some values of -PMF is decreased to keep the sum no more than 2^precision. - -Note that the input PMF is pre-quantization. +Note that the input PMF is pre-quantization. The input PMF is not normalized +by this op prior to quantization. Therefore the user is responsible for +normalizing PMF if necessary. )doc"); // clang-format on } // namespace tensorflow diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py new file mode 100644 index 0000000000000000000000000000000000000000..f039cb0f5265b920200f63c5bd5ebeb4e23826be --- /dev/null +++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py @@ -0,0 +1,697 @@ +# -*- coding: utf-8 -*- +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Entropy bottleneck layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.coder.python.ops import coder_ops + +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras._impl.keras import engine +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import functional_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.summary import summary + + +class EntropyBottleneck(engine.Layer): + """Entropy bottleneck layer. + + This layer can be used to model the entropy (the amount of information + conveyed) of the tensor passing through it. During training, this can be used + to impose a (soft) entropy constraint on its activations, limiting the amount + of information flowing through the layer. Note that this is distinct from + other types of bottlenecks, which reduce the dimensionality of the space, for + example. Dimensionality reduction does not limit the amount of information, + and does not enable efficient data compression per se. + + After training, this layer can be used to compress any input tensor to a + string, which may be written to a file, and to decompress a file which it + previously generated back to a reconstructed tensor (possibly on a different + machine having access to the same model checkpoint). The entropies estimated + during training or evaluation are approximately equal to the average length of + the strings in bits. + + The layer implements a flexible probability density model to estimate entropy, + which is described in the appendix of the paper (please cite the paper if you + use this code for scientific work): + + "Variational image compression with a scale hyperprior" + + Johannes Ballé, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston + + https://arxiv.org/abs/1802.01436 + + The layer assumes that the input tensor is at least 2D, with a batch dimension + at the beginning and a channel dimension as specified by `data_format`. The + layer trains an independent probability density model for each channel, but + assumes that across all other dimensions, the inputs are i.i.d. (independent + and identically distributed). Because the entropy (and hence, average + codelength) is a function of the densities, this assumption may have a direct + effect on the compression performance. + + Because data compression always involves discretization, the outputs of the + layer are generally only approximations of its inputs. During training, + discretization is modeled using additive uniform noise to ensure + differentiability. The entropies computed during training are differential + entropies. During evaluation, the data is actually quantized, and the + entropies are discrete (Shannon entropies). To make sure the approximated + tensor values are good enough for practical purposes, the training phase must + be used to balance the quality of the approximation with the entropy, by + adding an entropy term to the training loss, as in the following example. + + Here, we use the entropy bottleneck to compress the latent representation of + an autoencoder. The data vectors `x` in this case are 4D tensors in + `'channels_last'` format (for example, 16x16 pixel grayscale images). + + The layer always produces exactly one auxiliary loss and one update op which + are only significant for compression and decompression. To use the compression + feature, the auxiliary loss must be minimized during or after training. After + that, the update op must be executed at least once. Here, we simply attach + them to the main training step. + + Training: + ``` + # Build autoencoder. + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + entropy_bottleneck = EntropyBottleneck() + y_, likelihoods = entropy_bottleneck(y, training=True) + x_ = backward_transform(y_) + + # Information content (= predicted codelength) in bits of each batch element + # (note that taking the natural logarithm and dividing by `log(2)` is + # equivalent to taking base-2 logarithms): + bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) + + # Squared difference of each batch element: + squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) + + # The loss is a weighted sum of mean squared error and entropy (average + # information content), where the weight controls the trade-off between + # approximation error and entropy. + main_loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) + + # Minimize loss and auxiliary loss, and execute update op. + main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4) + main_step = optimizer.minimize(main_loss) + # 1e-2 is a good starting point for the learning rate of the auxiliary loss, + # assuming Adam is used. + aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-2) + aux_step = optimizer.minimize(entropy_bottleneck.losses[0]) + step = tf.group(main_step, aux_step, entropy_bottleneck.updates[0]) + ``` + + Evaluation: + ``` + # Build autoencoder. + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + y_, likelihoods = EntropyBottleneck()(y, training=False) + x_ = backward_transform(y_) + + # Information content (= predicted codelength) in bits of each batch element: + bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) + + # Squared difference of each batch element: + squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) + + # The loss is a weighted sum of mean squared error and entropy (average + # information content), where the weight controls the trade-off between + # approximation error and entropy. + loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) + ``` + + To be able to compress the bottleneck tensor and decompress it in a different + session, or on a different machine, you need three items: + - The compressed representations stored as strings. + - The shape of the bottleneck for these string representations as a `Tensor`, + as well as the number of channels of the bottleneck at graph construction + time. + - The checkpoint of the trained model that was used for compression. Note: + It is crucial that the auxiliary loss produced by this layer is minimized + during or after training, and that the update op is run after training and + minimization of the auxiliary loss, but *before* the checkpoint is saved. + + Compression: + ``` + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + strings = EntropyBottleneck().compress(y) + shape = tf.shape(y)[1:] + ``` + + Decompression: + ``` + strings = tf.placeholder(tf.string, shape=[None]) + shape = tf.placeholder(tf.int32, shape=[3]) + entropy_bottleneck = EntropyBottleneck(dtype=tf.float32) + y_ = entropy_bottleneck.decompress(strings, shape, channels=5) + x_ = backward_transform(y_) + ``` + Here, we assumed that the tensor produced by the forward transform has 5 + channels. + + The above four use cases can also be implemented within the same session (i.e. + on the same `EntropyBottleneck` instance), for testing purposes, etc., by + calling the object more than once. + + Arguments: + init_scale: Float. A scaling factor determining the initial width of the + probability densities. This should be chosen big enough so that the + range of values of the layer inputs roughly falls within the interval + [`-init_scale`, `init_scale`] at the beginning of training. + filters: An iterable of ints, giving the number of filters at each layer of + the density model. Generally, the more filters and layers, the more + expressive is the density model in terms of modeling more complicated + distributions of the layer inputs. For details, refer to the paper + referenced above. The default is `[3, 3, 3]`, which should be sufficient + for most practical purposes. + tail_mass: Float, between 0 and 1. The bottleneck layer automatically + determines the range of input values that should be represented based on + their frequency of occurrence. Values occurring in the tails of the + distributions will be clipped to that range during compression. + `tail_mass` determines the amount of probability mass in the tails which + is cut off in the worst case. For example, the default value of `1e-9` + means that at most 1 in a billion input samples will be clipped to the + range. + optimize_integer_offset: Boolean. Typically, the input values of this layer + are floats, which means that quantization during evaluation can be + performed with an arbitrary offset. By default, the layer determines that + offset automatically. In special situations, such as when it is known that + the layer will receive only full integer values during evaluation, it can + be desirable to set this argument to `False` instead, in order to always + quantize to full integer values. + likelihood_bound: Float. If positive, the returned likelihood values are + ensured to be greater than or equal to this value. This prevents very + large gradients with a typical entropy loss (defaults to 1e-9). + range_coder_precision: Integer, between 1 and 16. The precision of the range + coder used for compression and decompression. This trades off computation + speed with compression efficiency, where 16 is the slowest but most + efficient setting. Choosing lower values may increase the average + codelength slightly compared to the estimated entropies. + data_format: Either `'channels_first'` or `'channels_last'` (default). + trainable: Boolean. Whether the layer should be trained. + name: String. The name of the layer. + dtype: Default dtype of the layer's parameters (default of `None` means use + the type of the first input). + + Read-only properties: + init_scale: See above. + filters: See above. + tail_mass: See above. + optimize_integer_offset: See above. + likelihood_bound: See above. + range_coder_precision: See above. + data_format: See above. + name: String. See above. + dtype: See above. + trainable_variables: List of trainable variables. + non_trainable_variables: List of non-trainable variables. + variables: List of all variables of this layer, trainable and non-trainable. + updates: List of update ops of this layer. Always contains exactly one + update op, which must be run once after the last training step, before + `compress` or `decompress` is used. + losses: List of losses added by this layer. Always contains exactly one + auxiliary loss, which must be added to the training loss. + + Mutable properties: + trainable: Boolean. Whether the layer should be trained. + input_spec: Optional `InputSpec` object specifying the constraints on inputs + that can be accepted by the layer. + """ + + def __init__(self, init_scale=10, filters=(3, 3, 3), tail_mass=1e-9, + optimize_integer_offset=True, likelihood_bound=1e-9, + range_coder_precision=16, data_format="channels_last", **kwargs): + super(EntropyBottleneck, self).__init__(**kwargs) + self._init_scale = float(init_scale) + self._filters = tuple(int(f) for f in filters) + self._tail_mass = float(tail_mass) + if not 0 < self.tail_mass < 1: + raise ValueError( + "`tail_mass` must be between 0 and 1, got {}.".format(self.tail_mass)) + self._optimize_integer_offset = bool(optimize_integer_offset) + self._likelihood_bound = float(likelihood_bound) + self._range_coder_precision = int(range_coder_precision) + self._data_format = data_format + self._channel_axis(2) # trigger ValueError early + self.input_spec = engine.InputSpec(min_ndim=2) + + @property + def init_scale(self): + return self._init_scale + + @property + def filters(self): + return self._filters + + @property + def tail_mass(self): + return self._tail_mass + + @property + def optimize_integer_offset(self): + return self._optimize_integer_offset + + @property + def likelihood_bound(self): + return self._likelihood_bound + + @property + def range_coder_precision(self): + return self._range_coder_precision + + @property + def data_format(self): + return self._data_format + + def _channel_axis(self, ndim): + try: + return {"channels_first": 1, "channels_last": ndim - 1}[self.data_format] + except KeyError: + raise ValueError("Unsupported `data_format` for {} layer: {}.".format( + self.__class__.__name__, self.data_format)) + + def _logits_cumulative(self, inputs, stop_gradient): + """Evaluate logits of the cumulative densities. + + Args: + inputs: The values at which to evaluate the cumulative densities, expected + to be a `Tensor` of shape `(channels, 1, batch)`. + stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so + that the gradient of the output with respect to the density model + parameters is disconnected (the gradient with respect to `inputs` is + left untouched). + + Returns: + A `Tensor` of the same shape as `inputs`, containing the logits of the + cumulative densities evaluated at the given inputs. + """ + logits = inputs + + for i in range(len(self.filters) + 1): + matrix = self._matrices[i] + if stop_gradient: + matrix = array_ops.stop_gradient(matrix) + logits = math_ops.matmul(matrix, logits) + + bias = self._biases[i] + if stop_gradient: + bias = array_ops.stop_gradient(bias) + logits += bias + + if i < len(self._factors): + factor = self._factors[i] + if stop_gradient: + factor = array_ops.stop_gradient(factor) + logits += factor * math_ops.tanh(logits) + + return logits + + def build(self, input_shape): + """Builds the layer. + + Creates the variables for the network modeling the densities, creates the + auxiliary loss estimating the median and tail quantiles of the densities, + and then uses that to create the probability mass functions and the update + op that produces the discrete cumulative density functions used by the range + coder. + + Args: + input_shape: Shape of the input tensor, used to get the number of + channels. + + Raises: + ValueError: if `input_shape` doesn't specify the length of the channel + dimension. + """ + input_shape = tensor_shape.TensorShape(input_shape) + channel_axis = self._channel_axis(input_shape.ndims) + channels = input_shape[channel_axis].value + if channels is None: + raise ValueError("The channel dimension of the inputs must be defined.") + self.input_spec = engine.InputSpec( + ndim=input_shape.ndims, axes={channel_axis: channels}) + filters = (1,) + self.filters + (1,) + scale = self.init_scale ** (1 / (len(self.filters) + 1)) + + # Create variables. + self._matrices = [] + self._biases = [] + self._factors = [] + for i in range(len(self.filters) + 1): + init = np.log(np.expm1(1 / scale / filters[i + 1])) + matrix = self.add_variable( + "matrix_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], filters[i]), + initializer=init_ops.Constant(init)) + matrix = nn.softplus(matrix) + self._matrices.append(matrix) + + bias = self.add_variable( + "bias_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], 1), + initializer=init_ops.RandomUniform(-.5, .5)) + self._biases.append(bias) + + if i < len(self.filters): + factor = self.add_variable( + "factor_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], 1), + initializer=init_ops.Zeros()) + factor = math_ops.tanh(factor) + self._factors.append(factor) + + # To figure out what range of the densities to sample, we need to compute + # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we + # can't take inverses of the cumulative directly, we make it an optimization + # problem: + # `quantiles = argmin(|logit(cumulative) - target|)` + # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`. + # Taking the logit (inverse of sigmoid) of the cumulative makes the + # representation of the right target more numerically stable. + + # Numerically stable way of computing logits of `tail_mass / 2` + # and `1 - tail_mass / 2`. + target = np.log(2 / self.tail_mass - 1) + # Compute lower and upper tail quantile as well as median. + target = constant_op.constant([-target, 0, target], dtype=self.dtype) + + def quantiles_initializer(shape, dtype=None, partition_info=None): + del partition_info # unused + assert tuple(shape[1:]) == (1, 3) + init = constant_op.constant( + [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype) + return array_ops.tile(init, (shape[0], 1, 1)) + + quantiles = self.add_variable( + "quantiles", shape=(channels, 1, 3), dtype=self.dtype, + initializer=quantiles_initializer) + logits = self._logits_cumulative(quantiles, stop_gradient=True) + loss = math_ops.reduce_sum(abs(logits - target)) + self.add_loss(loss, inputs=None) + + # Save medians for `call`, `compress`, and `decompress`. + self._medians = quantiles[:, :, 1:2] + if not self.optimize_integer_offset: + self._medians = math_ops.round(self._medians) + + # Largest distance observed between lower tail quantile and median, + # or between median and upper tail quantile. + minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1]) + maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians) + minmax = math_ops.maximum(minima, maxima) + minmax = math_ops.ceil(minmax) + minmax = math_ops.maximum(minmax, 1) + + # Sample the density up to `minmax` around the median. + samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype) + samples += self._medians + + half = constant_op.constant(.5, dtype=self.dtype) + # We strip the sigmoid from the end here, so we can use the special rule + # below to only compute differences in the left tail of the sigmoid. + # This increases numerical stability (see explanation in `call`). + lower = self._logits_cumulative(samples - half, stop_gradient=True) + upper = self._logits_cumulative(samples + half, stop_gradient=True) + # Flip signs if we can move more towards the left tail of the sigmoid. + sign = -math_ops.sign(math_ops.add_n([lower, upper])) + pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) + # Add tail masses to first and last bin of pmf, as we clip values for + # compression, meaning that out-of-range values get mapped to these bins. + pmf = array_ops.concat([ + math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]), + pmf[:, 0, 1:-1], + math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]), + ], axis=-1) + self._pmf = pmf + + cdf = coder_ops.pmf_to_quantized_cdf( + pmf, precision=self.range_coder_precision) + def cdf_getter(*args, **kwargs): + del args, kwargs # ignored + return variable_scope.get_variable( + "quantized_cdf", dtype=dtypes.int32, initializer=cdf, + trainable=False, validate_shape=False, collections=()) + # Need to provide a fake shape here since add_variable insists on it. + self._quantized_cdf = self.add_variable( + "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32, + getter=cdf_getter, trainable=False) + + update_op = state_ops.assign( + self._quantized_cdf, cdf, validate_shape=False) + self.add_update(update_op, inputs=None) + + super(EntropyBottleneck, self).build(input_shape) + + def call(self, inputs, training): + """Pass a tensor through the bottleneck. + + Args: + inputs: The tensor to be passed through the bottleneck. + training: Boolean. If `True`, returns a differentiable approximation of + the inputs, and their likelihoods under the modeled probability + densities. If `False`, returns the quantized inputs and their + likelihoods under the corresponding probability mass function. These + quantities can't be used for training, as they are not differentiable, + but represent actual compression more closely. + + Returns: + values: `Tensor` with the same shape as `inputs` containing the perturbed + or quantized input values. + likelihood: `Tensor` with the same shape as `inputs` containing the + likelihood of `values` under the modeled probability distributions. + + Raises: + ValueError: if `inputs` has different `dtype` or number of channels than + a previous set of inputs the model was invoked with earlier. + """ + inputs = ops.convert_to_tensor(inputs) + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + half = constant_op.constant(.5, dtype=self.dtype) + + # Convert to (channels, 1, batch) format by commuting channels to front + # and then collapsing. + order = list(range(ndim)) + order.pop(channel_axis) + order.insert(0, channel_axis) + values = array_ops.transpose(inputs, order) + shape = array_ops.shape(values) + values = array_ops.reshape(values, (shape[0], 1, -1)) + + # Add noise or quantize. + if training: + noise = random_ops.random_uniform(array_ops.shape(values), -half, half) + values = math_ops.add_n([values, noise]) + elif self.optimize_integer_offset: + values = math_ops.round(values - self._medians) + self._medians + else: + values = math_ops.round(values) + + # Evaluate densities. + # We can use the special rule below to only compute differences in the left + # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1 + # for large x, 0 for small x. Subtracting two numbers close to 0 can be done + # with much higher precision than subtracting two numbers close to 1. + lower = self._logits_cumulative(values - half, stop_gradient=False) + upper = self._logits_cumulative(values + half, stop_gradient=False) + # Flip signs if we can move more towards the left tail of the sigmoid. + sign = -math_ops.sign(math_ops.add_n([lower, upper])) + sign = array_ops.stop_gradient(sign) + likelihood = abs( + math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) + if self.likelihood_bound > 0: + likelihood_bound = constant_op.constant( + self.likelihood_bound, dtype=self.dtype) + # TODO(jballe): Override gradients. + likelihood = math_ops.maximum(likelihood, likelihood_bound) + + # Convert back to input tensor shape. + order = list(range(1, ndim)) + order.insert(channel_axis, 0) + values = array_ops.reshape(values, shape) + values = array_ops.transpose(values, order) + likelihood = array_ops.reshape(likelihood, shape) + likelihood = array_ops.transpose(likelihood, order) + + if not context.executing_eagerly(): + values_shape, likelihood_shape = self.compute_output_shape(inputs.shape) + values.set_shape(values_shape) + likelihood.set_shape(likelihood_shape) + + return values, likelihood + + def compress(self, inputs): + """Compress inputs and store their binary representations into strings. + + Args: + inputs: `Tensor` with values to be compressed. + + Returns: + String `Tensor` vector containing the compressed representation of each + batch element of `inputs`. + """ + with ops.name_scope(self._name_scope()): + inputs = ops.convert_to_tensor(inputs) + if not self.built: + # Check input assumptions set before layer building, e.g. input rank. + self._assert_input_compatibility(inputs) + if self.dtype is None: + self._dtype = inputs.dtype.base_dtype.name + self.build(inputs.shape) + + # Check input assumptions set after layer building, e.g. input shape. + if not context.executing_eagerly(): + self._assert_input_compatibility(inputs) + + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + # Tuple of slices for expanding dimensions of tensors below. + slices = ndim * [None] + [slice(None)] + slices[channel_axis] = slice(None) + slices = tuple(slices) + + # Expand dimensions of CDF to input dimensions, keeping the channels along + # the right dimension. + cdf = self._quantized_cdf[slices[1:]] + num_levels = array_ops.shape(cdf)[-1] - 1 + + # Bring inputs to the right range by centering the range on the medians. + half = constant_op.constant(.5, dtype=self.dtype) + medians = array_ops.squeeze(self._medians, [1, 2]) + offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians + # Expand offsets to input dimensions and add to inputs. + values = inputs + offsets[slices[:-1]] + + # Clip to range and cast to integers. Because we have added .5 above, and + # all values are positive, the cast effectively implements rounding. + values = math_ops.maximum(values, half) + values = math_ops.minimum( + values, math_ops.cast(num_levels, self.dtype) - half) + values = math_ops.cast(values, dtypes.int16) + + def loop_body(tensor): + return coder_ops.range_encode( + tensor, cdf, precision=self.range_coder_precision) + strings = functional_ops.map_fn( + loop_body, values, dtype=dtypes.string, back_prop=False) + + if not context.executing_eagerly(): + strings.set_shape(inputs.shape[:1]) + + return strings + + def decompress(self, strings, shape, channels=None): + """Decompress values from their compressed string representations. + + Args: + strings: A string `Tensor` vector containing the compressed data. + shape: A `Tensor` vector of int32 type. Contains the shape of the tensor + to be decompressed, excluding the batch dimension. + channels: Integer. Specifies the number of channels statically. Needs only + be set if the layer hasn't been built yet (i.e., this is the first input + it receives). + + Returns: + The decompressed `Tensor`. Its shape will be equal to `shape` prepended + with the batch dimension from `strings`. + + Raises: + ValueError: If the length of `shape` isn't available at graph construction + time. + """ + with ops.name_scope(self._name_scope()): + strings = ops.convert_to_tensor(strings) + shape = ops.convert_to_tensor(shape) + if self.built: + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + if channels is None: + channels = self.input_spec.axes[channel_axis] + else: + if not (shape.shape.is_fully_defined() and shape.shape.ndims == 1): + raise ValueError("`shape` must be a vector with known length.") + ndim = shape.shape[0].value + 1 + channel_axis = self._channel_axis(ndim) + input_shape = ndim * [None] + input_shape[channel_axis] = channels + self.build(input_shape) + + # Tuple of slices for expanding dimensions of tensors below. + slices = ndim * [None] + [slice(None)] + slices[channel_axis] = slice(None) + slices = tuple(slices) + + # Expand dimensions of CDF to input dimensions, keeping the channels along + # the right dimension. + cdf = self._quantized_cdf[slices[1:]] + num_levels = array_ops.shape(cdf)[-1] - 1 + + def loop_body(string): + return coder_ops.range_decode( + string, shape, cdf, precision=self.range_coder_precision) + outputs = functional_ops.map_fn( + loop_body, strings, dtype=dtypes.int16, back_prop=False) + outputs = math_ops.cast(outputs, self.dtype) + + medians = array_ops.squeeze(self._medians, [1, 2]) + offsets = math_ops.cast(num_levels // 2, self.dtype) - medians + outputs -= offsets[slices[:-1]] + + if not context.executing_eagerly(): + outputs_shape = ndim * [None] + outputs_shape[0] = strings.shape[0] + outputs_shape[channel_axis] = channels + outputs.set_shape(outputs_shape) + + return outputs + + def visualize(self): + """Multi-channel visualization of densities as images. + + Creates and returns an image summary visualizing the current probabilty + density estimates. The image contains one row for each channel. Within each + row, the pixel intensities are proportional to probability values, and each + row is centered on the median of the corresponding distribution. + + Returns: + The created image summary. + """ + with ops.name_scope(self._name_scope()): + image = self._pmf + image *= 255 / math_ops.reduce_max(image, axis=1, keepdims=True) + image = math_ops.cast(image + .5, dtypes.uint8) + image = image[None, :, :, None] + return summary.image("pmf", image, max_outputs=1) + + def compute_output_shape(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape) + return input_shape, input_shape diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py new file mode 100644 index 0000000000000000000000000000000000000000..798b0234ebcce7df108a0da65d1305502ce0253a --- /dev/null +++ b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py @@ -0,0 +1,315 @@ +# -*- coding: utf-8 -*- +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests of EntropyBottleneck class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.coder.python.layers import entropybottleneck + +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent + + +class EntropyBottleneckTest(test.TestCase): + + def test_noise(self): + # Tests that the noise added is uniform noise between -0.5 and 0.5. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck() + noisy, _ = layer(inputs, training=True) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + values = np.linspace(-50, 50, 100)[:, None] + noisy, = sess.run([noisy], {inputs: values}) + self.assertFalse(np.allclose(values, noisy, rtol=0, atol=.49)) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + + def test_quantization(self): + # Tests that inputs are quantized to full integer values, even after + # quantiles have been updated. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=False) + quantized, _ = layer(inputs, training=False) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + values = np.linspace(-50, 50, 100)[:, None] + quantized, = sess.run([quantized], {inputs: values}) + self.assertAllClose(np.around(values), quantized, rtol=0, atol=1e-6) + + def test_quantization_optimized_offset(self): + # Tests that inputs are not quantized to full integer values after quantiles + # have been updated. However, the difference between input and output should + # be between -0.5 and 0.5, and the offset must be consistent. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=True) + quantized, _ = layer(inputs, training=False) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + values = np.linspace(-50, 50, 100)[:, None] + quantized, = sess.run([quantized], {inputs: values}) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + diff = np.ravel(np.around(values) - quantized) % 1 + self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) + self.assertNotEqual(diff[0], 0) + + def test_codec(self): + # Tests that inputs are compressed and decompressed correctly, and quantized + # to full integer values, even after quantiles have been updated. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=60, + optimize_integer_offset=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + self.assertAllClose(np.around(values), decoded, rtol=0, atol=1e-6) + + def test_codec_optimized_offset(self): + # Tests that inputs are compressed and decompressed correctly, and not + # quantized to full integer values after quantiles have been updated. + # However, the difference between input and output should be between -0.5 + # and 0.5, and the offset must be consistent. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=60, + optimize_integer_offset=True) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + diff = np.ravel(np.around(values) - decoded) % 1 + self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) + self.assertNotEqual(diff[0], 0) + + def test_codec_clipping(self): + # Tests that inputs are compressed and decompressed correctly, and clipped + # to the expected range. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=40) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + expected = np.clip(np.around(values), -40, 40) + self.assertAllClose(expected, decoded, rtol=0, atol=1e-6) + + def test_channels_last(self): + # Test the layer with more than one channel and multiple input dimensions, + # with the channels in the last dimension. + inputs = array_ops.placeholder(dtypes.float32, (None, None, None, 2)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=50) + noisy, _ = layer(inputs, training=True) + quantized, _ = layer(inputs, training=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.normal(size=(7, 5, 3, 2)) + noisy, quantized, decoded = sess.run( + [noisy, quantized, decoded], {inputs: values}) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + + def test_channels_first(self): + # Test the layer with more than one channel and multiple input dimensions, + # with the channel dimension right after the batch dimension. + inputs = array_ops.placeholder(dtypes.float32, (None, 3, None, None)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", init_scale=50) + noisy, _ = layer(inputs, training=True) + quantized, _ = layer(inputs, training=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.normal(size=(2, 3, 5, 7)) + noisy, quantized, decoded = sess.run( + [noisy, quantized, decoded], {inputs: values}) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + + def test_compress(self): + # Test compression and decompression, and produce test data for + # `test_decompress`. If you set the constant at the end to `True`, this test + # will fail and the log will contain the new test data. + inputs = array_ops.placeholder(dtypes.float32, (2, 3, 10)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", filters=(), init_scale=2) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.uniform(size=(2, 3, 10)) - 2.5 + bitstrings, quantized_cdf, decoded = sess.run( + [bitstrings, layer._quantized_cdf, decoded], {inputs: values}) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + # Set this constant to `True` to log new test data for `test_decompress`. + if False: # pylint:disable=using-constant-test + assert False, (bitstrings, quantized_cdf, decoded) + + # Data generated by `test_compress`. + # pylint:disable=g-inconsistent-quotes,bad-whitespace + bitstrings = np.array([ + b'\x1e\xbag}\xc2\xdaN\x8b\xbd.', + b'\x8dF\xf0%\x1cv\xccllW' + ], dtype=object) + + quantized_cdf = np.array([ + [ 0, 15636, 22324, 30145, 38278, 65536], + [ 0, 19482, 26927, 35052, 42904, 65535], + [ 0, 21093, 28769, 36919, 44578, 65536] + ], dtype=np.int32) + + expected = np.array([ + [[-2., 1., 0., -2., -1., -2., -2., -2., 2., -1.], + [ 1., 2., 1., 0., -2., -2., 1., 2., 0., 1.], + [ 2., 0., -2., 2., 0., -1., -2., 0., 2., 0.]], + [[ 1., 2., 0., -1., 1., 2., 1., 1., 2., -2.], + [ 2., -1., -1., 0., -1., 2., 0., 2., -2., 2.], + [ 2., -2., -2., -1., -2., 1., -2., 0., 0., 0.]] + ], dtype=np.float32) + # pylint:enable=g-inconsistent-quotes,bad-whitespace + + def test_decompress(self): + # Test that decompression of values compressed with a previous version + # works, i.e. that the file format doesn't change across revisions. + bitstrings = array_ops.placeholder(dtypes.string) + input_shape = array_ops.placeholder(dtypes.int32) + quantized_cdf = array_ops.placeholder(dtypes.int32) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", filters=(), dtype=dtypes.float32) + layer.build(self.expected.shape) + layer._quantized_cdf = quantized_cdf + decoded = layer.decompress(bitstrings, input_shape[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + decoded, = sess.run([decoded], { + bitstrings: self.bitstrings, input_shape: self.expected.shape, + quantized_cdf: self.quantized_cdf}) + self.assertAllClose(self.expected, decoded, rtol=0, atol=1e-6) + + def test_build_decompress(self): + # Test that layer can be built when `decompress` is the first call to it. + bitstrings = array_ops.placeholder(dtypes.string) + input_shape = array_ops.placeholder(dtypes.int32, shape=[3]) + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.decompress(bitstrings, input_shape[1:], channels=5) + self.assertTrue(layer.built) + + def test_pmf_normalization(self): + # Test that probability mass functions are normalized correctly. + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.build((None, 10)) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + pmf, = sess.run([layer._pmf]) + self.assertAllClose(np.ones(10), np.sum(pmf, axis=-1), rtol=0, atol=1e-6) + + def test_visualize(self): + # Test that summary op can be constructed. + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.build((None, 10)) + summary = layer.visualize() + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run([summary]) + + def test_normalization(self): + # Test that densities are normalized correctly. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(filters=(2,)) + _, likelihood = layer(inputs, training=True) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + x = np.repeat(np.arange(-200, 201), 1000)[:, None] + likelihood, = sess.run([likelihood], {inputs: x}) + self.assertEqual(x.shape, likelihood.shape) + integral = np.sum(likelihood) * .001 + self.assertAllClose(1, integral, rtol=0, atol=1e-4) + + def test_entropy_estimates(self): + # Test that entropy estimates match actual range coding. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + filters=(2, 3), data_format="channels_last") + _, likelihood = layer(inputs, training=True) + diff_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) + _, likelihood = layer(inputs, training=False) + disc_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) + bitstrings = layer.compress(inputs) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + diff_entropy, disc_entropy, bitstrings = sess.run( + [diff_entropy, disc_entropy, bitstrings], + {inputs: np.random.normal(size=(1, 10000, 1))}) + codelength = 8 * sum(len(bitstring) for bitstring in bitstrings) + self.assertAllClose(diff_entropy, disc_entropy, rtol=5e-3, atol=0) + self.assertAllClose(disc_entropy, codelength, rtol=5e-3, atol=0) + self.assertGreater(codelength, disc_entropy) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..619153df67c90cea5a5082a411972948bac5fe90 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/BUILD @@ -0,0 +1,91 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + +# Transitive dependencies of this target will be included in the pip package. +py_library( + name = "constrained_optimization_pip", + deps = [ + ":constrained_optimization", + ":test_util", + ], +) + +py_library( + name = "constrained_optimization", + srcs = [ + "__init__.py", + "python/candidates.py", + "python/constrained_minimization_problem.py", + "python/constrained_optimizer.py", + "python/external_regret_optimizer.py", + "python/swap_regret_optimizer.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework", + "//tensorflow/python:standard_ops", + "//tensorflow/python:state_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + "@six_archive//:six", + ], +) + +py_test( + name = "candidates_test", + srcs = ["python/candidates_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + "//tensorflow/python:client_testlib", + "//third_party/py/numpy", + ], +) + +# NOTE: This library can't be "testonly" since it needs to be included in the +# pip package. +py_library( + name = "test_util", + srcs = ["python/test_util.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + "//tensorflow/python:dtypes", + "//tensorflow/python:standard_ops", + ], +) + +py_test( + name = "external_regret_optimizer_test", + srcs = ["python/external_regret_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + ":test_util", + "//tensorflow/python:client_testlib", + "//tensorflow/python:standard_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + ], +) + +py_test( + name = "swap_regret_optimizer_test", + srcs = ["python/swap_regret_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + ":test_util", + "//tensorflow/python:client_testlib", + "//tensorflow/python:standard_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + ], +) diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c65a150464efc1e77419040f66f36fc6756325aa --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/README.md @@ -0,0 +1,345 @@ + + +# ConstrainedOptimization (TFCO) + +TFCO is a library for optimizing inequality-constrained problems in TensorFlow. +Both the objective function and the constraints are represented as Tensors, +giving users the maximum amount of flexibility in specifying their optimization +problems. + +This flexibility makes optimization considerably more difficult: on a non-convex +problem, if one uses the "standard" approach of introducing a Lagrange +multiplier for each constraint, and then jointly maximizing over the Lagrange +multipliers and minimizing over the model parameters, then a stable stationary +point might not even *exist*. Hence, in some cases, oscillation, instead of +convergence, is inevitable. + +Thankfully, it turns out that even if, over the course of optimization, no +*particular* iterate does a good job of minimizing the objective while +satisfying the constraints, the *sequence* of iterates, on average, usually +will. This observation suggests the following approach: at training time, we'll +periodically snapshot the model state during optimization; then, at evaluation +time, each time we're given a new example to evaluate, we'll sample one of the +saved snapshots uniformly at random, and apply it to the example. This +*stochastic model* will generally perform well, both with respect to the +objective function, and the constraints. + +In fact, we can do better: it's possible to post-process the set of snapshots to +find a distribution over at most $$m+1$$ snapshots, where $$m$$ is the number of +constraints, that will be at least as good (and will usually be much better) +than the (much larger) uniform distribution described above. If you're unable or +unwilling to use a stochastic model at all, then you can instead use a heuristic +to choose the single best snapshot. + +For full details, motivation, and theoretical results on the approach taken by +this library, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +which will be referred to as [CoJiSr18] throughout the remainder of this +document. + +### Proxy Constraints + +Imagine that we want to constrain the recall of a binary classifier to be at +least 90%. Since the recall is proportional to the number of true positive +classifications, which itself is a sum of indicator functions, this constraint +is non-differentible, and therefore cannot be used in a problem that will be +optimized using a (stochastic) gradient-based algorithm. + +For this and similar problems, TFCO supports so-called *proxy constraints*, +which are (at least semi-differentiable) approximations of the original +constraints. For example, one could create a proxy recall function by replacing +the indicator functions with sigmoids. During optimization, each proxy +constraint function will be penalized, with the magnitude of the penalty being +chosen to satisfy the corresponding *original* (non-proxy) constraint. + +On a problem including proxy constraints—even a convex problem—the +Lagrangian approach discussed above isn't guaranteed to work. However, a +different algorithm, based on minimizing *swap regret*, does work. Aside from +this difference, the recommended procedure for optimizing a proxy-constrained +problem remains the same: periodically snapshot the model during optimization, +and then either find the best $$m+1$$-sized distribution, or heuristically +choose the single best snapshot. + +## Components + +* [constrained_minimization_problem](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py): + contains the `ConstrainedMinimizationProblem` interface. Your own + constrained optimization problems should be represented using + implementations of this interface. + +* [constrained_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py): + contains the `ConstrainedOptimizer` interface, which is similar to (but + different from) `tf.train.Optimizer`, with the main difference being that + `ConstrainedOptimizer`s are given `ConstrainedMinimizationProblem`s to + optimize, and perform constrained optimization. + + * [external_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py): + contains the `AdditiveExternalRegretOptimizer` implementation, which is + a `ConstrainedOptimizer` implementing the Lagrangian approach discussed + above (with additive updates to the Lagrange multipliers). You should + use this optimizer for problems *without* proxy constraints. It may also + work for problems with proxy constraints, but we recommend using a swap + regret optimizer, instead. + + This optimizer is most similar to Algorithm 3 in Appendix C.3 of + [CoJiSr18], and is discussed in Section 3. The two differences are that + it uses proxy constraints (if they're provided) in the update of the + model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for + the "inner" updates. + + * [swap_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py): + contains the `AdditiveSwapRegretOptimizer` and + `MultiplicativeSwapRegretOptimizer` implementations, which are + `ConstrainedOptimizer`s implementing the swap-regret minimization + approach mentioned above (with additive or multiplicative updates, + respectively, to the parameters associated with the + constraints—these parameters are not Lagrange multipliers, but + play a similar role). You should use one of these optimizers (we suggest + `MultiplicativeSwapRegretOptimizer`) for problems *with* proxy + constraints. + + The `MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 + in Section 4 of [CoJiSr18], with the difference being that it uses + `tf.train.Optimizer`s, instead of SGD, for the "inner" updates. The + `AdditiveSwapRegretOptimizer` differs further in that it performs + additive (instead of multiplicative) updates of the stochastic matrix. + +* [candidates](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/candidates.py): + contains two functions, `find_best_candidate_distribution` and + `find_best_candidate_index`. Both of these functions are given a set of + candidate solutions to a constrained optimization problem, from which the + former finds the best distribution over at most $$m+1$$ candidates, and the + latter heuristically finds the single best candidate. As discussed above, + the set of candidates will typically be model snapshots saved periodically + during optimization. Both of these functions require that scipy be + installed. + + The `find_best_candidate_distribution` function implements the approach + described in Lemma 3 of [CoJiSr18], while `find_best_candidate_index` + implements the heuristic used for hyperparameter search in the experiments + of Section 5.2. + +## Convex Example with Proxy Constraints + +This is a simple example of recall-constrained optimization on simulated data: +we will try to find a classifier that minimizes the average hinge loss while +constraining recall to be at least 90%. + +We'll start with the required imports—notice the definition of `tfco`: + +```python +import math +import numpy as np +import tensorflow as tf + +tfco = tf.contrib.constrained_optimization +``` + +We'll now create an implementation of the `ConstrainedMinimizationProblem` class +for this problem. The constructor takes three parameters: a Tensor containing +the classification labels (0 or 1) for every training example, another Tensor +containing the model's predictions on every training example (sometimes called +the "logits"), and the lower bound on recall that will be enforced using a +constraint. + +This implementation will contain both constraints *and* proxy constraints: the +former represents the constraint that the true recall (defined in terms of the +*number* of true positives) be at least `recall_lower_bound`, while the latter +represents the same constraint, but on a hinge approximation of the recall. + +```python +class ExampleProblem(tfco.ConstrainedMinimizationProblem): + + def __init__(self, labels, predictions, recall_lower_bound): + self._labels = labels + self._predictions = predictions + self._recall_lower_bound = recall_lower_bound + # The number of positively-labeled examples. + self._positive_count = tf.reduce_sum(self._labels) + + @property + def objective(self): + return tf.losses.hinge_loss(labels=self._labels, logits=self._predictions) + + @property + def constraints(self): + true_positives = self._labels * tf.to_float(self._predictions > 0) + true_positive_count = tf.reduce_sum(true_positives) + recall = true_positive_count / self._positive_count + # The constraint is (recall >= self._recall_lower_bound), which we convert + # to (self._recall_lower_bound - recall <= 0) because + # ConstrainedMinimizationProblems must always provide their constraints in + # the form (tensor <= 0). + # + # The result of this function should be a tensor, with each element being + # a quantity that is constrained to be nonpositive. We only have one + # constraint, so we return a one-element tensor. + return self._recall_lower_bound - recall + + @property + def proxy_constraints(self): + # Use 1 - hinge since we're SUBTRACTING recall in the constraint function, + # and we want the proxy constraint function to be convex. + true_positives = self._labels * tf.minimum(1.0, self._predictions) + true_positive_count = tf.reduce_sum(true_positives) + recall = true_positive_count / self._positive_count + # Please see the corresponding comment in the constraints property. + return self._recall_lower_bound - recall +``` + +We'll now create a simple simulated dataset by sampling 1000 random +10-dimensional feature vectors from a Gaussian, finding their labels using a +random "ground truth" linear model, and then adding noise by randomly flipping +200 labels. + +```python +# Create a simulated 10-dimensional training dataset consisting of 1000 labeled +# examples, of which 800 are labeled correctly and 200 are mislabeled. +num_examples = 1000 +num_mislabeled_examples = 200 +dimension = 10 +# We will constrain the recall to be at least 90%. +recall_lower_bound = 0.9 + +# Create random "ground truth" parameters to a linear model. +ground_truth_weights = np.random.normal(size=dimension) / math.sqrt(dimension) +ground_truth_threshold = 0 + +# Generate a random set of features for each example. +features = np.random.normal(size=(num_examples, dimension)).astype( + np.float32) / math.sqrt(dimension) +# Compute the labels from these features given the ground truth linear model. +labels = (np.matmul(features, ground_truth_weights) > + ground_truth_threshold).astype(np.float32) +# Add noise by randomly flipping num_mislabeled_examples labels. +mislabeled_indices = np.random.choice( + num_examples, num_mislabeled_examples, replace=False) +labels[mislabeled_indices] = 1 - labels[mislabeled_indices] +``` + +We're now ready to construct our model, and the corresponding optimization +problem. We'll use a linear model of the form $$f(x) = w^T x - t$$, where $$w$$ +is the `weights`, and $$t$$ is the `threshold`. The `problem` variable will hold +an instance of the `ExampleProblem` class we created earlier. + +```python +# Create variables containing the model parameters. +weights = tf.Variable(tf.zeros(dimension), dtype=tf.float32, name="weights") +threshold = tf.Variable(0.0, dtype=tf.float32, name="threshold") + +# Create the optimization problem. +constant_labels = tf.constant(labels, dtype=tf.float32) +constant_features = tf.constant(features, dtype=tf.float32) +predictions = tf.tensordot(constant_features, weights, axes=(1, 0)) - threshold +problem = ExampleProblem( + labels=constant_labels, + predictions=predictions, + recall_lower_bound=recall_lower_bound, +) +``` + +We're almost ready to train our model, but first we'll create a couple of +functions to measure its performance. We're interested in two quantities: the +average hinge loss (which we seek to minimize), and the recall (which we +constrain). + +```python +def average_hinge_loss(labels, predictions): + num_examples, = np.shape(labels) + signed_labels = (labels * 2) - 1 + total_hinge_loss = np.sum(np.maximum(0.0, 1.0 - signed_labels * predictions)) + return total_hinge_loss / num_examples + +def recall(labels, predictions): + positive_count = np.sum(labels) + true_positives = labels * (predictions > 0) + true_positive_count = np.sum(true_positives) + return true_positive_count / positive_count +``` + +As was mentioned earlier, external regret optimizers suffice for problems +without proxy constraints, but swap regret optimizers are recommended for +problems *with* proxy constraints. Since this problem contains proxy +constraints, we use the `MultiplicativeSwapRegretOptimizer`. + +For this problem, the constraint is fairly easy to satisfy, so we can use the +same "inner" optimizer (an `AdagradOptimizer` with a learning rate of 1) for +optimization of both the model parameters (`weights` and `threshold`), and the +internal parameters associated with the constraints (these are the analogues of +the Lagrange multipliers used by the `MultiplicativeSwapRegretOptimizer`). For +more difficult problems, it will often be necessary to use different optimizers, +with different learning rates (presumably found via a hyperparameter search): to +accomplish this, pass *both* the `optimizer` and `constraint_optimizer` +parameters to `MultiplicativeSwapRegretOptimizer`'s constructor. + +Since this is a convex problem (both the objective and proxy constraint +functions are convex), we can just take the last iterate. Periodic snapshotting, +and the use of the `find_best_candidate_distribution` or +`find_best_candidate_index` functions, is generally only necessary for +non-convex problems (and even then, it isn't *always* necessary). + +```python +with tf.Session() as session: + optimizer = tfco.MultiplicativeSwapRegretOptimizer( + optimizer=tf.train.AdagradOptimizer(learning_rate=1.0)) + train_op = optimizer.minimize(problem) + + session.run(tf.global_variables_initializer()) + for ii in xrange(1000): + session.run(train_op) + + trained_weights, trained_threshold = session.run((weights, threshold)) + +trained_predictions = np.matmul(features, trained_weights) - trained_threshold +print("Constrained average hinge loss = %f" % average_hinge_loss( + labels, trained_predictions)) +print("Constrained recall = %f" % recall(labels, trained_predictions)) +``` + +Running the above code gives the following output (due to the randomness of the +dataset, you'll get a different result when you run it): + +```none +Constrained average hinge loss = 0.710019 +Constrained recall = 0.899811 +``` + +As we hoped, the recall is extremely close to 90%—and, thanks to the use +of proxy constraints, this is the *true* recall, not a hinge approximation. + +For comparison, let's try optimizing the same problem *without* the recall +constraint: + +```python +with tf.Session() as session: + optimizer = tf.train.AdagradOptimizer(learning_rate=1.0) + # For optimizing the unconstrained problem, we just minimize the "objective" + # portion of the minimization problem. + train_op = optimizer.minimize(problem.objective) + + session.run(tf.global_variables_initializer()) + for ii in xrange(1000): + session.run(train_op) + + trained_weights, trained_threshold = session.run((weights, threshold)) + +trained_predictions = np.matmul(features, trained_weights) - trained_threshold +print("Unconstrained average hinge loss = %f" % average_hinge_loss( + labels, trained_predictions)) +print("Unconstrained recall = %f" % recall(labels, trained_predictions)) +``` + +This code gives the following output (again, you'll get a different answer, +since the dataset is random): + +```none +Unconstrained average hinge loss = 0.627271 +Unconstrained recall = 0.793951 +``` + +Because there is no constraint, the unconstrained problem does a better job of +minimizing the average hinge loss, but naturally doesn't approach 90% recall. diff --git a/tensorflow/contrib/constrained_optimization/__init__.py b/tensorflow/contrib/constrained_optimization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e49ba9f179ea98aaa9c35f79787605b53a1ec53 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A library for performing constrained optimization in TensorFlow.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=wildcard-import +from tensorflow.contrib.constrained_optimization.python.candidates import * +from tensorflow.contrib.constrained_optimization.python.constrained_minimization_problem import * +from tensorflow.contrib.constrained_optimization.python.constrained_optimizer import * +from tensorflow.contrib.constrained_optimization.python.external_regret_optimizer import * +from tensorflow.contrib.constrained_optimization.python.swap_regret_optimizer import * +# pylint: enable=wildcard-import + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + "AdditiveExternalRegretOptimizer", + "AdditiveSwapRegretOptimizer", + "ConstrainedMinimizationProblem", + "ConstrainedOptimizer", + "find_best_candidate_distribution", + "find_best_candidate_index", + "MultiplicativeSwapRegretOptimizer", +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py new file mode 100644 index 0000000000000000000000000000000000000000..ac86a6741be1f244476f917d0e151166db65524b --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/candidates.py @@ -0,0 +1,319 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Code for optimizing over a set of candidate solutions. + +The functions in this file deal with the constrained problem: + +> minimize f(w) +> s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + +Here, f(w) is the "objective function", and g_i(w) is the ith (of m) "constraint +function". Given the values of the objective and constraint functions for a set +of n "candidate solutions" {w_0,w_1,...,w_{n-1}} (for a total of n objective +function values, and n*m constraint function values), the +`find_best_candidate_distribution` function finds the best DISTRIBUTION over +these candidates, while `find_best_candidate_index' heuristically finds the +single best candidate. + +Both of these functions have dependencies on `scipy`, so if you want to call +them, then you must make sure that `scipy` is available. The imports are +performed inside the functions themselves, so if they're not actually called, +then `scipy` is not needed. + +For more specifics, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +The `find_best_candidate_distribution` function implements the approach +described in Lemma 3, while `find_best_candidate_index` implements the heuristic +used for hyperparameter search in the experiments of Section 5.2. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + + +def _find_best_candidate_distribution_helper(objective_vector, + constraints_matrix, + maximum_violation=0.0): + """Finds a distribution minimizing an objective subject to constraints. + + This function deals with the constrained problem: + + > minimize f(w) + > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + + Here, f(w) is the "objective function", and g_i(w) is the ith (of m) + "constraint function". Given a set of n "candidate solutions" + {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n + candidates that, in expectation, minimizes the objective while violating + the constraints by no more than `maximum_violation`. If no such distribution + exists, it returns an error (using Go-style error reporting). + + The `objective_vector` parameter should be a numpy array with shape (n,), for + which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a + numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j). + + This function will return a distribution for which at most m+1 probabilities, + and often fewer, are nonzero. + + Args: + objective_vector: numpy array of shape (n,), where n is the number of + "candidate solutions". Contains the objective function values. + constraints_matrix: numpy array of shape (m,n), where m is the number of + constraints and n is the number of "candidate solutions". Contains the + constraint violation magnitudes. + maximum_violation: nonnegative float, the maximum amount by which any + constraint may be violated, in expectation. + + Returns: + A pair (`result`, `message`), exactly one of which is None. If `message` is + None, then the `result` contains the optimal distribution as a numpy array + of shape (n,). If `result` is None, then `message` contains an error + message. + + Raises: + ValueError: If `objective_vector` and `constraints_matrix` have inconsistent + shapes, or if `maximum_violation` is negative. + ImportError: If we're unable to import `scipy.optimize`. + """ + if maximum_violation < 0.0: + raise ValueError("maximum_violation must be nonnegative") + + mm, nn = np.shape(constraints_matrix) + if (nn,) != np.shape(objective_vector): + raise ValueError( + "objective_vector must have shape (n,), and constraints_matrix (m, n)," + " where n is the number of candidates, and m is the number of " + "constraints") + + # We import scipy inline, instead of at the top of the file, so that a scipy + # dependency is only introduced if either find_best_candidate_distribution() + # or find_best_candidate_index() are actually called. + import scipy.optimize # pylint: disable=g-import-not-at-top + + # Feasibility (within maximum_violation) constraints. + a_ub = constraints_matrix + b_ub = np.full((mm, 1), maximum_violation) + # Sum-to-one constraint. + a_eq = np.ones((1, nn)) + b_eq = np.ones((1, 1)) + # Nonnegativity constraints. + bounds = (0, None) + + result = scipy.optimize.linprog( + objective_vector, + A_ub=a_ub, + b_ub=b_ub, + A_eq=a_eq, + b_eq=b_eq, + bounds=bounds) + # Go-style error reporting. We don't raise on error, since + # find_best_candidate_distribution() needs to handle the failure case, and we + # shouldn't use exceptions as flow-control. + if not result.success: + return (None, result.message) + else: + return (result.x, None) + + +def find_best_candidate_distribution(objective_vector, + constraints_matrix, + epsilon=0.0): + """Finds a distribution minimizing an objective subject to constraints. + + This function deals with the constrained problem: + + > minimize f(w) + > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + + Here, f(w) is the "objective function", and g_i(w) is the ith (of m) + "constraint function". Given a set of n "candidate solutions" + {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n + candidates that, in expectation, minimizes the objective while violating + the constraints by the smallest possible amount (with the amount being found + via bisection search). + + The `objective_vector` parameter should be a numpy array with shape (n,), for + which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a + numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j). + + This function will return a distribution for which at most m+1 probabilities, + and often fewer, are nonzero. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + This function implements the approach described in Lemma 3. + + Args: + objective_vector: numpy array of shape (n,), where n is the number of + "candidate solutions". Contains the objective function values. + constraints_matrix: numpy array of shape (m,n), where m is the number of + constraints and n is the number of "candidate solutions". Contains the + constraint violation magnitudes. + epsilon: nonnegative float, the threshold at which to terminate the binary + search while searching for the minimal expected constraint violation + magnitude. + + Returns: + The optimal distribution, as a numpy array of shape (n,). + + Raises: + ValueError: If `objective_vector` and `constraints_matrix` have inconsistent + shapes, or if `epsilon` is negative. + ImportError: If we're unable to import `scipy.optimize`. + """ + if epsilon < 0.0: + raise ValueError("epsilon must be nonnegative") + + # If there is a feasible solution (i.e. with maximum_violation=0), then that's + # what we'll return. + pp, _ = _find_best_candidate_distribution_helper(objective_vector, + constraints_matrix) + if pp is not None: + return pp + + # The bound is the minimum over all candidates, of the maximum per-candidate + # constraint violation. + lower = 0.0 + upper = np.min(np.amax(constraints_matrix, axis=0)) + best_pp, _ = _find_best_candidate_distribution_helper( + objective_vector, constraints_matrix, maximum_violation=upper) + assert best_pp is not None + + # Throughout this loop, a maximum_violation of "lower" is not achievable, + # but a maximum_violation of "upper" is achiveable. + while True: + middle = 0.5 * (lower + upper) + if (middle - lower <= epsilon) or (upper - middle <= epsilon): + break + else: + pp, _ = _find_best_candidate_distribution_helper( + objective_vector, constraints_matrix, maximum_violation=middle) + if pp is None: + lower = middle + else: + best_pp = pp + upper = middle + + return best_pp + + +def find_best_candidate_index(objective_vector, + constraints_matrix, + rank_objectives=False): + """Heuristically finds the best candidate solution to a constrained problem. + + This function deals with the constrained problem: + + > minimize f(w) + > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + + Here, f(w) is the "objective function", and g_i(w) is the ith (of m) + "constraint function". Given a set of n "candidate solutions" + {w_0,w_1,...,w_{n-1}}, this function finds the "best" solution according + to the following heuristic: + + 1. Across all models, the ith constraint violations (i.e. max{0, g_i(0)}) + are ranked, as are the objectives (if rank_objectives=True). + 2. Each model is then associated its MAXIMUM rank across all m constraints + (and the objective, if rank_objectives=True). + 3. The model with the minimal maximum rank is then identified. Ties are + broken using the objective function value. + 4. The index of this "best" model is returned. + + The `objective_vector` parameter should be a numpy array with shape (n,), for + which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a + numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j). + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + This function implements the heuristic used for hyperparameter search in the + experiments of Section 5.2. + + Args: + objective_vector: numpy array of shape (n,), where n is the number of + "candidate solutions". Contains the objective function values. + constraints_matrix: numpy array of shape (m,n), where m is the number of + constraints and n is the number of "candidate solutions". Contains the + constraint violation magnitudes. + rank_objectives: bool, whether the objective function values should be + included in the initial ranking step. If True, both the objective and + constraints will be ranked. If False, only the constraints will be ranked. + In either case, the objective function values will be used for + tiebreaking. + + Returns: + The index (in {0,1,...,n-1}) of the "best" model according to the above + heuristic. + + Raises: + ValueError: If `objective_vector` and `constraints_matrix` have inconsistent + shapes. + ImportError: If we're unable to import `scipy.stats`. + """ + mm, nn = np.shape(constraints_matrix) + if (nn,) != np.shape(objective_vector): + raise ValueError( + "objective_vector must have shape (n,), and constraints_matrix (m, n)," + " where n is the number of candidates, and m is the number of " + "constraints") + + # We import scipy inline, instead of at the top of the file, so that a scipy + # dependency is only introduced if either find_best_candidate_distribution() + # or find_best_candidate_index() are actually called. + import scipy.stats # pylint: disable=g-import-not-at-top + + if rank_objectives: + maximum_ranks = scipy.stats.rankdata(objective_vector, method="min") + else: + maximum_ranks = np.zeros(nn, dtype=np.int64) + for ii in xrange(mm): + # Take the maximum of the constraint functions with zero, since we want to + # rank the magnitude of constraint *violations*. If the constraint is + # satisfied, then we don't care how much it's satisfied by (as a result, we + # we expect all models satisfying a constraint to be tied at rank 1). + ranks = scipy.stats.rankdata( + np.maximum(0.0, constraints_matrix[ii, :]), method="min") + maximum_ranks = np.maximum(maximum_ranks, ranks) + + best_index = None + best_rank = float("Inf") + best_objective = float("Inf") + for ii in xrange(nn): + if maximum_ranks[ii] < best_rank: + best_index = ii + best_rank = maximum_ranks[ii] + best_objective = objective_vector[ii] + elif (maximum_ranks[ii] == best_rank) and (objective_vector[ii] <= + best_objective): + best_index = ii + best_objective = objective_vector[ii] + + return best_index diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a4c49d48bc5c763489215261a909573af0f19055 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py @@ -0,0 +1,95 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for constrained_optimization.python.candidates.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.constrained_optimization.python import candidates +from tensorflow.python.platform import test + + +class CandidatesTest(test.TestCase): + + def test_inconsistent_shapes_for_best_distribution(self): + """An error is raised when parameters have inconsistent shapes.""" + objective_vector = np.array([1, 2, 3]) + constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + with self.assertRaises(ValueError): + _ = candidates.find_best_candidate_distribution(objective_vector, + constraints_matrix) + + def test_inconsistent_shapes_for_best_index(self): + """An error is raised when parameters have inconsistent shapes.""" + objective_vector = np.array([1, 2, 3]) + constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + with self.assertRaises(ValueError): + _ = candidates.find_best_candidate_index(objective_vector, + constraints_matrix) + + def test_best_distribution(self): + """Distribution should match known solution.""" + objective_vector = np.array( + [0.03053309, -0.06667082, 0.88355145, 0.46529806]) + constraints_matrix = np.array( + [[-0.60164551, 0.36676229, 0.7856454, -0.8441711], + [0.00371592, -0.16392108, -0.59778071, -0.56908492]]) + distribution = candidates.find_best_candidate_distribution( + objective_vector, constraints_matrix) + # Verify that the solution is a probability distribution. + self.assertTrue(np.all(distribution >= 0)) + self.assertAlmostEqual(np.sum(distribution), 1.0) + # Verify that the solution satisfies the constraints. + maximum_constraint_violation = np.amax( + np.dot(constraints_matrix, distribution)) + self.assertLessEqual(maximum_constraint_violation, 0) + # Verify that the solution matches that which we expect. + expected_distribution = np.array([0.37872711, 0.62127289, 0, 0]) + self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6) + + def test_best_index_rank_objectives_true(self): + """Index should match known solution.""" + # Objective ranks = [2, 1, 4, 3]. + objective_vector = np.array( + [0.03053309, -0.06667082, 0.88355145, 0.46529806]) + # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]]. + constraints_matrix = np.array( + [[-0.60164551, 0.36676229, 0.7856454, -0.8441711], + [0.00371592, -0.16392108, -0.59778071, -0.56908492]]) + # Maximum ranks = [4, 3, 4, 3]. + index = candidates.find_best_candidate_index( + objective_vector, constraints_matrix, rank_objectives=True) + self.assertEqual(1, index) + + def test_best_index_rank_objectives_false(self): + """Index should match known solution.""" + # Objective ranks = [2, 1, 4, 3]. + objective_vector = np.array( + [0.03053309, -0.06667082, 0.88355145, 0.46529806]) + # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]]. + constraints_matrix = np.array( + [[-0.60164551, 0.36676229, 0.7856454, -0.8441711], + [0.00371592, -0.16392108, -0.59778071, -0.56908492]]) + # Maximum ranks = [4, 3, 4, 1]. + index = candidates.find_best_candidate_index( + objective_vector, constraints_matrix, rank_objectives=False) + self.assertEqual(3, index) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py new file mode 100644 index 0000000000000000000000000000000000000000..70813fb217956b167b80a7e1d555c8ba79088fdb --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py @@ -0,0 +1,123 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines abstract class for `ConstrainedMinimizationProblem`s. + +A ConstrainedMinimizationProblem consists of an objective function to minimize, +and a set of constraint functions that are constrained to be nonpositive. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +import six + + +@six.add_metaclass(abc.ABCMeta) +class ConstrainedMinimizationProblem(object): + """Abstract class representing a `ConstrainedMinimizationProblem`. + + A ConstrainedMinimizationProblem consists of an objective function to + minimize, and a set of constraint functions that are constrained to be + nonpositive. + + In addition to the constraint functions, there may (optionally) be proxy + constraint functions: a ConstrainedOptimizer will attempt to penalize these + proxy constraint functions so as to satisfy the (non-proxy) constraints. Proxy + constraints could be used if the constraints functions are difficult or + impossible to optimize (e.g. if they're piecewise constant), in which case the + proxy constraints should be some approximation of the original constraints + that is well-enough behaved to permit successful optimization. + """ + + @abc.abstractproperty + def objective(self): + """Returns the objective function. + + Returns: + A 0d tensor that should be minimized. + """ + pass + + @property + def num_constraints(self): + """Returns the number of constraints. + + Returns: + An int containing the number of constraints. + + Raises: + ValueError: If the constraints (or proxy_constraints, if present) do not + have fully-known shapes, OR if proxy_constraints are present, and the + shapes of constraints and proxy_constraints are fully-known, but they're + different. + """ + constraints_shape = self.constraints.get_shape() + if self.proxy_constraints is None: + proxy_constraints_shape = constraints_shape + else: + proxy_constraints_shape = self.proxy_constraints.get_shape() + + if (constraints_shape is None or proxy_constraints_shape is None or + any([ii is None for ii in constraints_shape.as_list()]) or + any([ii is None for ii in proxy_constraints_shape.as_list()])): + raise ValueError( + "constraints and proxy_constraints must have fully-known shapes") + if constraints_shape != proxy_constraints_shape: + raise ValueError( + "constraints and proxy_constraints must have the same shape") + + size = 1 + for ii in constraints_shape.as_list(): + size *= ii + return int(size) + + @abc.abstractproperty + def constraints(self): + """Returns the vector of constraint functions. + + Letting g_i be the ith element of the constraints vector, the ith constraint + will be g_i <= 0. + + Returns: + A tensor of constraint functions. + """ + pass + + # This is a property, instead of an abstract property, since it doesn't need + # to be overridden: if proxy_constraints returns None, then there are no + # proxy constraints. + @property + def proxy_constraints(self): + """Returns the optional vector of proxy constraint functions. + + The difference between `constraints` and `proxy_constraints` is that, when + proxy constraints are present, the `constraints` are merely EVALUATED during + optimization, whereas the `proxy_constraints` are DIFFERENTIATED. If there + are no proxy constraints, then the `constraints` are both evaluated and + differentiated. + + For example, if we want to impose constraints on step functions, then we + could use these functions for `constraints`. However, because a step + function has zero gradient almost everywhere, we can't differentiate these + functions, so we would take `proxy_constraints` to be some differentiable + approximation of `constraints`. + + Returns: + A tensor of proxy constraint functions. + """ + return None diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..805554536610a5e2cc650ff0b47185f4fbd6fac5 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py @@ -0,0 +1,208 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines base class for `ConstrainedOptimizer`s.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +import six + +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.training import optimizer as train_optimizer + + +@six.add_metaclass(abc.ABCMeta) +class ConstrainedOptimizer(object): + """Base class representing a constrained optimizer. + + A ConstrainedOptimizer wraps a tf.train.Optimizer (or more than one), and + applies it to a ConstrainedMinimizationProblem. Unlike a tf.train.Optimizer, + which takes a tensor to minimize as a parameter to its minimize() method, a + constrained optimizer instead takes a ConstrainedMinimizationProblem. + """ + + def __init__(self, optimizer): + """Constructs a new `ConstrainedOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the + ConstraintedMinimizationProblem. + + Returns: + A new `ConstrainedOptimizer`. + """ + self._optimizer = optimizer + + @property + def optimizer(self): + """Returns the `tf.train.Optimizer` used for optimization.""" + return self._optimizer + + def minimize_unconstrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the unconstrained problem. + + Unlike `minimize_constrained`, this function ignores the `constraints` (and + `proxy_constraints`) portion of the minimization problem entirely, and only + minimizes `objective`. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + return self.optimizer.minimize( + minimization_problem.objective, + global_step=global_step, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + name=name, + grad_loss=grad_loss) + + @abc.abstractmethod + def minimize_constrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + Unlike `minimize_unconstrained`, this function attempts to find a solution + that minimizes the `objective` portion of the minimization problem while + satisfying the `constraints` portion. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + pass + + def minimize(self, + minimization_problem, + unconstrained_steps=None, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + This method combines the functionality of `minimize_unconstrained` and + `minimize_constrained`. If global_step < unconstrained_steps, it will + perform an unconstrained update, and if global_step >= unconstrained_steps, + it will perform a constrained update. + + The reason for this functionality is that it may be best to initialize the + constrained optimizer with an approximate optimum of the unconstrained + problem. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + unconstrained_steps: int, number of steps for which we should perform + unconstrained updates, before transitioning to constrained updates. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + + Raises: + ValueError: If unconstrained_steps is provided, but global_step is not. + """ + + def unconstrained_fn(): + """Returns an `Op` for minimizing the unconstrained problem.""" + return self.minimize_unconstrained( + minimization_problem=minimization_problem, + global_step=global_step, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + name=name, + grad_loss=grad_loss) + + def constrained_fn(): + """Returns an `Op` for minimizing the constrained problem.""" + return self.minimize_constrained( + minimization_problem=minimization_problem, + global_step=global_step, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + name=name, + grad_loss=grad_loss) + + if unconstrained_steps is not None: + if global_step is None: + raise ValueError( + "global_step cannot be None if unconstrained_steps is provided") + unconstrained_steps_tensor = ops.convert_to_tensor(unconstrained_steps) + dtype = unconstrained_steps_tensor.dtype + return control_flow_ops.cond( + standard_ops.cast(global_step, dtype) < unconstrained_steps_tensor, + true_fn=unconstrained_fn, + false_fn=constrained_fn) + else: + return constrained_fn() diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..01c6e4f08afb93e37aa124f31ca7faa10b07d4d6 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py @@ -0,0 +1,375 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines `AdditiveExternalRegretOptimizer`. + +This optimizer minimizes a `ConstrainedMinimizationProblem` by introducing +Lagrange multipliers, and using `tf.train.Optimizer`s to jointly optimize over +the model parameters and Lagrange multipliers. + +For the purposes of constrained optimization, at least in theory, +external-regret minimization suffices if the `ConstrainedMinimizationProblem` +we're optimizing doesn't have any `proxy_constraints`, while swap-regret +minimization should be used if `proxy_constraints` are present. + +For more specifics, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +The formulation used by the AdditiveExternalRegretOptimizer--which is simply the +usual Lagrangian formulation--can be found in Definition 1, and is discussed in +Section 3. This optimizer is most similar to Algorithm 3 in Appendix C.3, with +the two differences being that it uses proxy constraints (if they're provided) +in the update of the model parameters, and uses `tf.train.Optimizer`s, instead +of SGD, for the "inner" updates. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +import six + +from tensorflow.contrib.constrained_optimization.python import constrained_optimizer + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import optimizer as train_optimizer + + +def _project_multipliers_wrt_euclidean_norm(multipliers, radius): + """Projects its argument onto the feasible region. + + The feasible region is the set of all vectors with nonnegative elements that + sum to at most `radius`. + + Args: + multipliers: 1d tensor, the Lagrange multipliers to project. + radius: float, the radius of the feasible region. + + Returns: + The 1d tensor that results from projecting `multipliers` onto the feasible + region w.r.t. the Euclidean norm. + + Raises: + ValueError: if the `multipliers` tensor does not have a fully-known shape, + or is not one-dimensional. + """ + multipliers_shape = multipliers.get_shape() + if multipliers_shape is None: + raise ValueError("multipliers must have known shape") + if multipliers_shape.ndims != 1: + raise ValueError( + "multipliers must be one dimensional (instead is %d-dimensional)" % + multipliers_shape.ndims) + dimension = multipliers_shape[0].value + if dimension is None: + raise ValueError("multipliers must have fully-known shape") + + def while_loop_condition(iteration, multipliers, inactive, old_inactive): + """Returns false if the while loop should terminate.""" + del multipliers # Needed by the body, but not the condition. + not_done = (iteration < dimension) + not_converged = standard_ops.reduce_any( + standard_ops.not_equal(inactive, old_inactive)) + return standard_ops.logical_and(not_done, not_converged) + + def while_loop_body(iteration, multipliers, inactive, old_inactive): + """Performs one iteration of the projection.""" + del old_inactive # Needed by the condition, but not the body. + iteration += 1 + scale = standard_ops.minimum( + 0.0, + (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum( + 1.0, standard_ops.reduce_sum(inactive))) + multipliers += scale * inactive + new_inactive = standard_ops.to_float(multipliers > 0) + multipliers *= new_inactive + return (iteration, multipliers, new_inactive, inactive) + + iteration = standard_ops.constant(0) + inactive = standard_ops.ones_like(multipliers) + + # We actually want a do-while loop, so we explicitly call while_loop_body() + # once before tf.while_loop(). + iteration, multipliers, inactive, old_inactive = while_loop_body( + iteration, multipliers, inactive, inactive) + iteration, multipliers, inactive, old_inactive = control_flow_ops.while_loop( + while_loop_condition, + while_loop_body, + loop_vars=(iteration, multipliers, inactive, old_inactive), + name="euclidean_projection") + + return multipliers + + +@six.add_metaclass(abc.ABCMeta) +class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer): + """Base class representing an `_ExternalRegretOptimizer`. + + This class contains most of the logic for performing constrained + optimization, minimizing external regret for the constraints player. What it + *doesn't* do is keep track of the internal state (the Lagrange multipliers). + Instead, the state is accessed via the _initial_state(), + _lagrange_multipliers(), _constraint_grad_and_var() and _projection_op() + methods. + + The reason for this is that we want to make it easy to implement different + representations of the internal state. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by `_ExternalRegretOptimizer`s--which is simply the usual + Lagrangian formulation--can be found in Definition 1, and is discussed in + Section 3. Such optimizers are most similar to Algorithm 3 in Appendix C.3. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Constructs a new `_ExternalRegretOptimizer`. + + The difference between `optimizer` and `constraint_optimizer` (if the latter + is provided) is that the former is used for learning the model parameters, + while the latter us used for the Lagrange multipliers. If no + `constraint_optimizer` is provided, then `optimizer` is used for both. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of the ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multipliers. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multipliers. + + Returns: + A new `_ExternalRegretOptimizer`. + """ + super(_ExternalRegretOptimizer, self).__init__(optimizer=optimizer) + self._constraint_optimizer = constraint_optimizer + + @property + def constraint_optimizer(self): + """Returns the `tf.train.Optimizer` used for the Lagrange multipliers.""" + return self._constraint_optimizer + + @abc.abstractmethod + def _initial_state(self, num_constraints): + pass + + @abc.abstractmethod + def _lagrange_multipliers(self, state): + pass + + @abc.abstractmethod + def _constraint_grad_and_var(self, state, gradient): + pass + + @abc.abstractmethod + def _projection_op(self, state, name=None): + pass + + def minimize_constrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + The `optimizer` constructor parameter will be used to update the model + parameters, while the Lagrange multipliers will be updated using + `constrained_optimizer` (if provided) or `optimizer` (if not). + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + objective = minimization_problem.objective + + constraints = minimization_problem.constraints + proxy_constraints = minimization_problem.proxy_constraints + if proxy_constraints is None: + proxy_constraints = constraints + # Flatten both constraints tensors to 1d. + num_constraints = minimization_problem.num_constraints + constraints = standard_ops.reshape(constraints, shape=(num_constraints,)) + proxy_constraints = standard_ops.reshape( + proxy_constraints, shape=(num_constraints,)) + + # We use a lambda to initialize the state so that, if this function call is + # inside the scope of a tf.control_dependencies() block, the dependencies + # will not be applied to the initializer. + state = standard_ops.Variable( + lambda: self._initial_state(num_constraints), + trainable=False, + name="external_regret_optimizer_state") + + multipliers = self._lagrange_multipliers(state) + loss = ( + objective + standard_ops.tensordot(multipliers, proxy_constraints, 1)) + multipliers_gradient = constraints + + update_ops = [] + if self.constraint_optimizer is None: + # If we don't have a separate constraint_optimizer, then we use + # self._optimizer for both the update of the model parameters, and that of + # the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + grads_and_vars.append( + self._constraint_grad_and_var(state, multipliers_gradient)) + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + else: + # If we have a separate constraint_optimizer, then we use self._optimizer + # for the update of the model parameters, and self._constraint_optimizer + # for that of the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + multiplier_grads_and_vars = [ + self._constraint_grad_and_var(state, multipliers_gradient) + ] + + gradients = [ + gradient for gradient, _ in grads_and_vars + multiplier_grads_and_vars + if gradient is not None + ] + with ops.control_dependencies(gradients): + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + update_ops.append( + self.constraint_optimizer.apply_gradients( + multiplier_grads_and_vars, name="optimizer_state_update")) + + with ops.control_dependencies(update_ops): + if global_step is None: + # If we don't have a global step, just project, and we're done. + return self._projection_op(state, name=name) + else: + # If we have a global step, then we need to increment it in addition to + # projecting. + projection_op = self._projection_op(state, name="project") + with ops.colocate_with(global_step): + global_step_op = state_ops.assign_add( + global_step, 1, name="global_step_increment") + return control_flow_ops.group(projection_op, global_step_op, name=name) + + +class AdditiveExternalRegretOptimizer(_ExternalRegretOptimizer): + """A `ConstrainedOptimizer` based on external-regret minimization. + + This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly + minimize over the model parameters, and maximize over Lagrange multipliers, + with the latter maximization using additive updates and an algorithm that + minimizes external regret. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by this optimizer--which is simply the usual Lagrangian + formulation--can be found in Definition 1, and is discussed in Section 3. It + is most similar to Algorithm 3 in Appendix C.3, with the two differences being + that it uses proxy constraints (if they're provided) in the update of the + model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for the + "inner" updates. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + maximum_multiplier_radius=None): + """Constructs a new `AdditiveExternalRegretOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multipliers. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multipliers. + maximum_multiplier_radius: float, an optional upper bound to impose on the + sum of the Lagrange multipliers. + + Returns: + A new `AdditiveExternalRegretOptimizer`. + + Raises: + ValueError: If the maximum_multiplier_radius parameter is nonpositive. + """ + super(AdditiveExternalRegretOptimizer, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + + if maximum_multiplier_radius and (maximum_multiplier_radius <= 0.0): + raise ValueError("maximum_multiplier_radius must be strictly positive") + + self._maximum_multiplier_radius = maximum_multiplier_radius + + def _initial_state(self, num_constraints): + # For an AdditiveExternalRegretOptimizer, the internal state is simply a + # tensor of Lagrange multipliers with shape (m,), where m is the number of + # constraints. + return standard_ops.zeros((num_constraints,), dtype=dtypes.float32) + + def _lagrange_multipliers(self, state): + return state + + def _constraint_grad_and_var(self, state, gradient): + # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True? + return (-gradient, state) + + def _projection_op(self, state, name=None): + with ops.colocate_with(state): + if self._maximum_multiplier_radius: + projected_multipliers = _project_multipliers_wrt_euclidean_norm( + state, self._maximum_multiplier_radius) + else: + projected_multipliers = standard_ops.maximum(state, 0.0) + return state_ops.assign(state, projected_multipliers, name=name) diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4bf6271009161c4c449cd9c3cdab9fba90aa59 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py @@ -0,0 +1,136 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for constrained_optimization.python.external_regret_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.constrained_optimization.python import external_regret_optimizer +from tensorflow.contrib.constrained_optimization.python import test_util + +from tensorflow.python.ops import standard_ops +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent + + +class AdditiveExternalRegretOptimizerWrapper( + external_regret_optimizer.AdditiveExternalRegretOptimizer): + """Testing wrapper class around AdditiveExternalRegretOptimizer. + + This class is identical to AdditiveExternalRegretOptimizer, except that it + caches the internal optimization state when _lagrange_multipliers() is called, + so that we can test that the Lagrange multipliers take on their expected + values. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + maximum_multiplier_radius=None): + """Same as AdditiveExternalRegretOptimizer.__init__.""" + super(AdditiveExternalRegretOptimizerWrapper, self).__init__( + optimizer=optimizer, + constraint_optimizer=constraint_optimizer, + maximum_multiplier_radius=maximum_multiplier_radius) + self._cached_lagrange_multipliers = None + + @property + def lagrange_multipliers(self): + """Returns the cached Lagrange multipliers.""" + return self._cached_lagrange_multipliers + + def _lagrange_multipliers(self, state): + """Caches the internal state for testing.""" + self._cached_lagrange_multipliers = super( + AdditiveExternalRegretOptimizerWrapper, + self)._lagrange_multipliers(state) + return self._cached_lagrange_multipliers + + +class ExternalRegretOptimizerTest(test.TestCase): + + def test_project_multipliers_wrt_euclidean_norm(self): + """Tests Euclidean projection routine on some known values.""" + multipliers1 = standard_ops.constant([-0.1, -0.6, -0.3]) + expected_projected_multipliers1 = np.array([0.0, 0.0, 0.0]) + + multipliers2 = standard_ops.constant([-0.1, 0.6, 0.3]) + expected_projected_multipliers2 = np.array([0.0, 0.6, 0.3]) + + multipliers3 = standard_ops.constant([0.4, 0.7, -0.2, 0.5, 0.1]) + expected_projected_multipliers3 = np.array([0.2, 0.5, 0.0, 0.3, 0.0]) + + with self.test_session() as session: + projected_multipliers1 = session.run( + external_regret_optimizer._project_multipliers_wrt_euclidean_norm( + multipliers1, 1.0)) + projected_multipliers2 = session.run( + external_regret_optimizer._project_multipliers_wrt_euclidean_norm( + multipliers2, 1.0)) + projected_multipliers3 = session.run( + external_regret_optimizer._project_multipliers_wrt_euclidean_norm( + multipliers3, 1.0)) + + self.assertAllClose( + expected_projected_multipliers1, + projected_multipliers1, + rtol=0, + atol=1e-6) + self.assertAllClose( + expected_projected_multipliers2, + projected_multipliers2, + rtol=0, + atol=1e-6) + self.assertAllClose( + expected_projected_multipliers3, + projected_multipliers3, + rtol=0, + atol=1e-6) + + def test_additive_external_regret_optimizer(self): + """Tests that the Lagrange multipliers update as expected.""" + minimization_problem = test_util.ConstantMinimizationProblem( + np.array([0.6, -0.1, 0.4])) + optimizer = AdditiveExternalRegretOptimizerWrapper( + gradient_descent.GradientDescentOptimizer(1.0), + maximum_multiplier_radius=1.0) + train_op = optimizer.minimize_constrained(minimization_problem) + + expected_multipliers = [ + np.array([0.0, 0.0, 0.0]), + np.array([0.6, 0.0, 0.4]), + np.array([0.7, 0.0, 0.3]), + np.array([0.8, 0.0, 0.2]), + np.array([0.9, 0.0, 0.1]), + np.array([1.0, 0.0, 0.0]), + np.array([1.0, 0.0, 0.0]), + ] + + multipliers = [] + with self.test_session() as session: + session.run(standard_ops.global_variables_initializer()) + while len(multipliers) < len(expected_multipliers): + multipliers.append(session.run(optimizer.lagrange_multipliers)) + session.run(train_op) + + for expected, actual in zip(expected_multipliers, multipliers): + self.assertAllClose(expected, actual, rtol=0, atol=1e-6) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..04014ab4aebd6d9cd70653c53f9361320e803329 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py @@ -0,0 +1,595 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines `{Additive,Multiplicative}SwapRegretOptimizer`s. + +These optimizers minimize a `ConstrainedMinimizationProblem` by using a +swap-regret minimizing algorithm (either SGD or multiplicative weights) to learn +what weights should be associated with the objective function and constraints. +These algorithms do *not* use Lagrange multipliers, but the idea is similar. +The main differences between the formulation used here, and the standard +Lagrangian formulation, are that (i) the objective function is weighted, in +addition to the constraints, and (ii) we learn a matrix of weights, instead of a +vector. + +For the purposes of constrained optimization, at least in theory, +external-regret minimization suffices if the `ConstrainedMinimizationProblem` +we're optimizing doesn't have any `proxy_constraints`, while swap-regret +minimization should be used if `proxy_constraints` are present. + +For more specifics, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +The formulation used by both of the SwapRegretOptimizers can be found in +Definition 2, and is discussed in Section 4. The +`MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 in Section 4, +with the difference being that it uses `tf.train.Optimizer`s, instead of SGD, +for the "inner" updates. The `AdditiveSwapRegretOptimizer` differs further in +that it performs additive (instead of multiplicative) updates of the stochastic +matrix. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import math + +import six + +from tensorflow.contrib.constrained_optimization.python import constrained_optimizer + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import optimizer as train_optimizer + + +def _maximal_eigenvector_power_method(matrix, + epsilon=1e-6, + maximum_iterations=100): + """Returns the maximal right-eigenvector of `matrix` using the power method. + + Args: + matrix: 2D Tensor, the matrix of which we will find the maximal + right-eigenvector. + epsilon: nonnegative float, if two iterations of the power method differ (in + L2 norm) by no more than epsilon, we will terminate. + maximum_iterations: nonnegative int, if we perform this many iterations, we + will terminate. + + Result: + The maximal right-eigenvector of `matrix`. + + Raises: + ValueError: If the epsilon or maximum_iterations parameters violate their + bounds. + """ + if epsilon <= 0.0: + raise ValueError("epsilon must be strictly positive") + if maximum_iterations <= 0: + raise ValueError("maximum_iterations must be strictly positive") + + def while_loop_condition(iteration, eigenvector, old_eigenvector): + """Returns false if the while loop should terminate.""" + not_done = (iteration < maximum_iterations) + not_converged = (standard_ops.norm(eigenvector - old_eigenvector) > epsilon) + return standard_ops.logical_and(not_done, not_converged) + + def while_loop_body(iteration, eigenvector, old_eigenvector): + """Performs one iteration of the power method.""" + del old_eigenvector # Needed by the condition, but not the body. + iteration += 1 + # We need to use tf.matmul() and tf.expand_dims(), instead of + # tf.tensordot(), since the former will infer the shape of the result, while + # the latter will not (tf.while_loop() needs the shapes). + new_eigenvector = standard_ops.matmul( + matrix, standard_ops.expand_dims(eigenvector, 1))[:, 0] + new_eigenvector /= standard_ops.norm(new_eigenvector) + return (iteration, new_eigenvector, eigenvector) + + iteration = standard_ops.constant(0) + eigenvector = standard_ops.ones_like(matrix[:, 0]) + eigenvector /= standard_ops.norm(eigenvector) + + # We actually want a do-while loop, so we explicitly call while_loop_body() + # once before tf.while_loop(). + iteration, eigenvector, old_eigenvector = while_loop_body( + iteration, eigenvector, eigenvector) + iteration, eigenvector, old_eigenvector = control_flow_ops.while_loop( + while_loop_condition, + while_loop_body, + loop_vars=(iteration, eigenvector, old_eigenvector), + name="power_method") + + return eigenvector + + +def _project_stochastic_matrix_wrt_euclidean_norm(matrix): + """Projects its argument onto the set of left-stochastic matrices. + + This algorithm is O(n^3) at worst, where `matrix` is n*n. It can be done in + O(n^2 * log(n)) time by sorting each column (and maybe better with a different + algorithm), but the algorithm implemented here is easier to implement in + TensorFlow. + + Args: + matrix: 2d square tensor, the matrix to project. + + Returns: + The 2d square tensor that results from projecting `matrix` onto the set of + left-stochastic matrices w.r.t. the Euclidean norm applied column-wise + (i.e. the Frobenius norm). + + Raises: + ValueError: if the `matrix` tensor does not have a fully-known shape, or is + not two-dimensional and square. + """ + matrix_shape = matrix.get_shape() + if matrix_shape is None: + raise ValueError("matrix must have known shape") + if matrix_shape.ndims != 2: + raise ValueError( + "matrix must be two dimensional (instead is %d-dimensional)" % + matrix_shape.ndims) + if matrix_shape[0] != matrix_shape[1]: + raise ValueError("matrix must be be square (instead has shape (%d,%d))" % + (matrix_shape[0], matrix_shape[1])) + dimension = matrix_shape[0].value + if dimension is None: + raise ValueError("matrix must have fully-known shape") + + def while_loop_condition(iteration, matrix, inactive, old_inactive): + """Returns false if the while loop should terminate.""" + del matrix # Needed by the body, but not the condition. + not_done = (iteration < dimension) + not_converged = standard_ops.reduce_any( + standard_ops.not_equal(inactive, old_inactive)) + return standard_ops.logical_and(not_done, not_converged) + + def while_loop_body(iteration, matrix, inactive, old_inactive): + """Performs one iteration of the projection.""" + del old_inactive # Needed by the condition, but not the body. + iteration += 1 + scale = (1.0 - standard_ops.reduce_sum( + matrix, axis=0, keep_dims=True)) / standard_ops.maximum( + 1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True)) + matrix += scale * inactive + new_inactive = standard_ops.to_float(matrix > 0) + matrix *= new_inactive + return (iteration, matrix, new_inactive, inactive) + + iteration = standard_ops.constant(0) + inactive = standard_ops.ones_like(matrix) + + # We actually want a do-while loop, so we explicitly call while_loop_body() + # once before tf.while_loop(). + iteration, matrix, inactive, old_inactive = while_loop_body( + iteration, matrix, inactive, inactive) + iteration, matrix, inactive, old_inactive = control_flow_ops.while_loop( + while_loop_condition, + while_loop_body, + loop_vars=(iteration, matrix, inactive, old_inactive), + name="euclidean_projection") + + return matrix + + +def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix): + """Projects its argument onto the set of log-left-stochastic matrices. + + Args: + log_matrix: 2d square tensor, the element-wise logarithm of the matrix to + project. + + Returns: + The 2d square tensor that results from projecting exp(`matrix`) onto the set + of left-stochastic matrices w.r.t. the KL-divergence applied column-wise. + """ + + # For numerical reasons, make sure that the largest matrix element is zero + # before exponentiating. + log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True) + log_matrix -= standard_ops.log( + standard_ops.reduce_sum( + standard_ops.exp(log_matrix), axis=0, keep_dims=True)) + return log_matrix + + +@six.add_metaclass(abc.ABCMeta) +class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer): + """Base class representing a `_SwapRegretOptimizer`. + + This class contains most of the logic for performing constrained optimization, + minimizing external regret for the constraints player. What it *doesn't* do is + keep track of the internal state (the stochastic matrix). Instead, the state + is accessed via the _initial_state(), _stochastic_matrix(), + _constraint_grad_and_var() and _projection_op() methods. + + The reason for this is that we want to make it easy to implement different + representations of the internal state. For example, for additive updates, it's + most natural to store the stochastic matrix directly, whereas for + multiplicative updates, it's most natural to store its element-wise logarithm. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by `_SwapRegretOptimizer`s can be found in Definition 2, + and is discussed in Section 4. Such optimizers are most similar to Algorithm + 2 in Section 4. Most notably, the internal state is a left-stochastic matrix + of shape (m+1,m+1), where m is the number of constraints. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Constructs a new `_SwapRegretOptimizer`. + + The difference between `optimizer` and `constraint_optimizer` (if the latter + is provided) is that the former is used for learning the model parameters, + while the latter us used for the update to the constraint/objective weight + matrix (the analogue of Lagrange multipliers). If no `constraint_optimizer` + is provided, then `optimizer` is used for both. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multiplier analogues. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multiplier analogues. + + Returns: + A new `_SwapRegretOptimizer`. + """ + super(_SwapRegretOptimizer, self).__init__(optimizer=optimizer) + self._constraint_optimizer = constraint_optimizer + + @property + def constraint_optimizer(self): + """Returns the `tf.train.Optimizer` used for the matrix.""" + return self._constraint_optimizer + + @abc.abstractmethod + def _initial_state(self, num_constraints): + pass + + @abc.abstractmethod + def _stochastic_matrix(self, state): + pass + + def _distribution(self, state): + distribution = _maximal_eigenvector_power_method( + self._stochastic_matrix(state)) + distribution = standard_ops.abs(distribution) + distribution /= standard_ops.reduce_sum(distribution) + return distribution + + @abc.abstractmethod + def _constraint_grad_and_var(self, state, gradient): + pass + + @abc.abstractmethod + def _projection_op(self, state, name=None): + pass + + def minimize_constrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + The `optimizer` constructor parameter will be used to update the model + parameters, while the constraint/objective weight matrix (the analogue of + Lagrange multipliers) will be updated using `constrained_optimizer` (if + provided) or `optimizer` (if not). Whether the matrix updates are additive + or multiplicative depends on the derived class. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + objective = minimization_problem.objective + + constraints = minimization_problem.constraints + proxy_constraints = minimization_problem.proxy_constraints + if proxy_constraints is None: + proxy_constraints = constraints + # Flatten both constraints tensors to 1d. + num_constraints = minimization_problem.num_constraints + constraints = standard_ops.reshape(constraints, shape=(num_constraints,)) + proxy_constraints = standard_ops.reshape( + proxy_constraints, shape=(num_constraints,)) + + # We use a lambda to initialize the state so that, if this function call is + # inside the scope of a tf.control_dependencies() block, the dependencies + # will not be applied to the initializer. + state = standard_ops.Variable( + lambda: self._initial_state(num_constraints), + trainable=False, + name="swap_regret_optimizer_state") + + zero_and_constraints = standard_ops.concat( + (standard_ops.zeros((1,)), constraints), axis=0) + objective_and_proxy_constraints = standard_ops.concat( + (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0) + + distribution = self._distribution(state) + loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints, + 1) + matrix_gradient = standard_ops.matmul( + standard_ops.expand_dims(zero_and_constraints, 1), + standard_ops.expand_dims(distribution, 0)) + + update_ops = [] + if self.constraint_optimizer is None: + # If we don't have a separate constraint_optimizer, then we use + # self._optimizer for both the update of the model parameters, and that of + # the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + grads_and_vars.append( + self._constraint_grad_and_var(state, matrix_gradient)) + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + else: + # If we have a separate constraint_optimizer, then we use self._optimizer + # for the update of the model parameters, and self._constraint_optimizer + # for that of the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + matrix_grads_and_vars = [ + self._constraint_grad_and_var(state, matrix_gradient) + ] + + gradients = [ + gradient for gradient, _ in grads_and_vars + matrix_grads_and_vars + if gradient is not None + ] + with ops.control_dependencies(gradients): + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + update_ops.append( + self.constraint_optimizer.apply_gradients( + matrix_grads_and_vars, name="optimizer_state_update")) + + with ops.control_dependencies(update_ops): + if global_step is None: + # If we don't have a global step, just project, and we're done. + return self._projection_op(state, name=name) + else: + # If we have a global step, then we need to increment it in addition to + # projecting. + projection_op = self._projection_op(state, name="project") + with ops.colocate_with(global_step): + global_step_op = state_ops.assign_add( + global_step, 1, name="global_step_increment") + return control_flow_ops.group(projection_op, global_step_op, name=name) + + +class AdditiveSwapRegretOptimizer(_SwapRegretOptimizer): + """A `ConstrainedOptimizer` based on swap-regret minimization. + + This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly + minimize over the model parameters, and maximize over constraint/objective + weight matrix (the analogue of Lagrange multipliers), with the latter + maximization using additive updates and an algorithm that minimizes swap + regret. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by this optimizer can be found in Definition 2, and is + discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with + the differences being that it uses `tf.train.Optimizer`s, instead of SGD, for + the "inner" updates, and performs additive (instead of multiplicative) updates + of the stochastic matrix. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Constructs a new `AdditiveSwapRegretOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multiplier analogues. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multiplier analogues. + + Returns: + A new `AdditiveSwapRegretOptimizer`. + """ + # TODO(acotter): add a parameter determining the initial values of the + # matrix elements (like initial_multiplier_radius in + # MultiplicativeSwapRegretOptimizer). + super(AdditiveSwapRegretOptimizer, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + + def _initial_state(self, num_constraints): + # For an AdditiveSwapRegretOptimizer, the internal state is a tensor of + # shape (m+1,m+1), where m is the number of constraints, representing a + # left-stochastic matrix. + dimension = num_constraints + 1 + # Initialize by putting all weight on the objective, and none on the + # constraints. + return standard_ops.concat( + (standard_ops.ones( + (1, dimension)), standard_ops.zeros((dimension - 1, dimension))), + axis=0) + + def _stochastic_matrix(self, state): + return state + + def _constraint_grad_and_var(self, state, gradient): + # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True? + return (-gradient, state) + + def _projection_op(self, state, name=None): + with ops.colocate_with(state): + return state_ops.assign( + state, + _project_stochastic_matrix_wrt_euclidean_norm(state), + name=name) + + +class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer): + """A `ConstrainedOptimizer` based on swap-regret minimization. + + This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly + minimize over the model parameters, and maximize over constraint/objective + weight matrix (the analogue of Lagrange multipliers), with the latter + maximization using multiplicative updates and an algorithm that minimizes swap + regret. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by this optimizer can be found in Definition 2, and is + discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with + the difference being that it uses `tf.train.Optimizer`s, instead of SGD, for + the "inner" updates. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + minimum_multiplier_radius=1e-3, + initial_multiplier_radius=None): + """Constructs a new `MultiplicativeSwapRegretOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multiplier analogues. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multiplier analogues. + minimum_multiplier_radius: float, each element of the matrix will be lower + bounded by `minimum_multiplier_radius` divided by one plus the number of + constraints. + initial_multiplier_radius: float, the initial value of each element of the + matrix associated with a constraint (i.e. excluding those elements + associated with the objective) will be `initial_multiplier_radius` + divided by one plus the number of constraints. Defaults to the value of + `minimum_multiplier_radius`. + + Returns: + A new `MultiplicativeSwapRegretOptimizer`. + + Raises: + ValueError: If the two radius parameters are inconsistent. + """ + super(MultiplicativeSwapRegretOptimizer, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + + if (minimum_multiplier_radius <= 0.0) or (minimum_multiplier_radius >= 1.0): + raise ValueError("minimum_multiplier_radius must be in the range (0,1)") + if initial_multiplier_radius is None: + initial_multiplier_radius = minimum_multiplier_radius + elif (initial_multiplier_radius < + minimum_multiplier_radius) or (minimum_multiplier_radius > 1.0): + raise ValueError("initial_multiplier_radius must be in the range " + "[minimum_multiplier_radius,1]") + + self._minimum_multiplier_radius = minimum_multiplier_radius + self._initial_multiplier_radius = initial_multiplier_radius + + def _initial_state(self, num_constraints): + # For a MultiplicativeSwapRegretOptimizer, the internal state is a tensor of + # shape (m+1,m+1), where m is the number of constraints, representing the + # element-wise logarithm of a left-stochastic matrix. + dimension = num_constraints + 1 + # Initialize by putting as much weight as possible on the objective, and as + # little as possible on the constraints. + log_initial_one = math.log(1.0 - (self._initial_multiplier_radius * + (dimension - 1) / (dimension))) + log_initial_zero = math.log(self._initial_multiplier_radius / dimension) + return standard_ops.concat( + (standard_ops.constant( + log_initial_one, dtype=dtypes.float32, shape=(1, dimension)), + standard_ops.constant( + log_initial_zero, + dtype=dtypes.float32, + shape=(dimension - 1, dimension))), + axis=0) + + def _stochastic_matrix(self, state): + return standard_ops.exp(state) + + def _constraint_grad_and_var(self, state, gradient): + # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True? + return (-gradient, state) + + def _projection_op(self, state, name=None): + with ops.colocate_with(state): + # Gets the dimension of the state (num_constraints + 1)--all of these + # assertions are of things that should be impossible, since the state + # passed into this method will have the same shape as that returned by + # _initial_state(). + state_shape = state.get_shape() + assert state_shape is not None + assert state_shape.ndims == 2 + assert state_shape[0] == state_shape[1] + dimension = state_shape[0].value + assert dimension is not None + + minimum_log_multiplier = standard_ops.log( + self._minimum_multiplier_radius / standard_ops.to_float(dimension)) + + return state_ops.assign( + state, + standard_ops.maximum( + _project_log_stochastic_matrix_wrt_kl_divergence(state), + minimum_log_multiplier), + name=name) diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..34c4543dca97e12c8335e4c90b849820edaefa81 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py @@ -0,0 +1,212 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for constrained_optimization.python.swap_regret_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.constrained_optimization.python import swap_regret_optimizer +from tensorflow.contrib.constrained_optimization.python import test_util + +from tensorflow.python.ops import standard_ops +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent + + +class AdditiveSwapRegretOptimizerWrapper( + swap_regret_optimizer.AdditiveSwapRegretOptimizer): + """Testing wrapper class around AdditiveSwapRegretOptimizer. + + This class is identical to AdditiveSwapRegretOptimizer, except that it caches + the internal optimization state when _stochastic_matrix() is called, so that + we can test that the stochastic matrices take on their expected values. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Same as AdditiveSwapRegretOptimizer.__init__().""" + super(AdditiveSwapRegretOptimizerWrapper, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + self._cached_stochastic_matrix = None + + @property + def stochastic_matrix(self): + """Returns the cached stochastic matrix.""" + return self._cached_stochastic_matrix + + def _stochastic_matrix(self, state): + """Caches the internal state for testing.""" + self._cached_stochastic_matrix = super(AdditiveSwapRegretOptimizerWrapper, + self)._stochastic_matrix(state) + return self._cached_stochastic_matrix + + +class MultiplicativeSwapRegretOptimizerWrapper( + swap_regret_optimizer.MultiplicativeSwapRegretOptimizer): + """Testing wrapper class around MultiplicativeSwapRegretOptimizer. + + This class is identical to MultiplicativeSwapRegretOptimizer, except that it + caches the internal optimization state when _stochastic_matrix() is called, so + that we can test that the stochastic matrices take on their expected values. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + minimum_multiplier_radius=None, + initial_multiplier_radius=None): + """Same as MultiplicativeSwapRegretOptimizer.__init__().""" + super(MultiplicativeSwapRegretOptimizerWrapper, self).__init__( + optimizer=optimizer, + constraint_optimizer=constraint_optimizer, + minimum_multiplier_radius=1e-3, + initial_multiplier_radius=initial_multiplier_radius) + self._cached_stochastic_matrix = None + + @property + def stochastic_matrix(self): + """Returns the cached stochastic matrix.""" + return self._cached_stochastic_matrix + + def _stochastic_matrix(self, state): + """Caches the internal state for testing.""" + self._cached_stochastic_matrix = super( + MultiplicativeSwapRegretOptimizerWrapper, + self)._stochastic_matrix(state) + return self._cached_stochastic_matrix + + +class SwapRegretOptimizerTest(test.TestCase): + + def test_maximum_eigenvector_power_method(self): + """Tests power method routine on some known left-stochastic matrices.""" + matrix1 = np.matrix([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], [0.4, 0.3, 0.0]]) + matrix2 = np.matrix([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], [0.4, 0.5, 0.3]]) + + with self.test_session() as session: + eigenvector1 = session.run( + swap_regret_optimizer._maximal_eigenvector_power_method( + standard_ops.constant(matrix1))) + eigenvector2 = session.run( + swap_regret_optimizer._maximal_eigenvector_power_method( + standard_ops.constant(matrix2))) + + # Check that eigenvector1 and eigenvector2 are eigenvectors of matrix1 and + # matrix2 (respectively) with associated eigenvalue 1. + matrix_eigenvector1 = np.tensordot(matrix1, eigenvector1, axes=1) + matrix_eigenvector2 = np.tensordot(matrix2, eigenvector2, axes=1) + self.assertAllClose(eigenvector1, matrix_eigenvector1, rtol=0, atol=1e-6) + self.assertAllClose(eigenvector2, matrix_eigenvector2, rtol=0, atol=1e-6) + + def test_project_stochastic_matrix_wrt_euclidean_norm(self): + """Tests Euclidean projection routine on some known values.""" + matrix = standard_ops.constant([[-0.1, -0.1, 0.4], [-0.8, 0.4, 1.2], + [-0.3, 0.1, 0.2]]) + expected_projected_matrix = np.array([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], + [0.4, 0.3, 0.0]]) + + with self.test_session() as session: + projected_matrix = session.run( + swap_regret_optimizer._project_stochastic_matrix_wrt_euclidean_norm( + matrix)) + + self.assertAllClose( + expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6) + + def test_project_log_stochastic_matrix_wrt_kl_divergence(self): + """Tests KL-divergence projection routine on some known values.""" + matrix = standard_ops.constant([[0.2, 0.8, 0.6], [0.1, 0.2, 1.5], + [0.2, 1.0, 0.9]]) + expected_projected_matrix = np.array([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], + [0.4, 0.5, 0.3]]) + + with self.test_session() as session: + projected_matrix = session.run( + standard_ops.exp( + swap_regret_optimizer. + _project_log_stochastic_matrix_wrt_kl_divergence( + standard_ops.log(matrix)))) + + self.assertAllClose( + expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6) + + def test_additive_swap_regret_optimizer(self): + """Tests that the stochastic matrices update as expected.""" + minimization_problem = test_util.ConstantMinimizationProblem( + np.array([0.6, -0.1, 0.4])) + optimizer = AdditiveSwapRegretOptimizerWrapper( + gradient_descent.GradientDescentOptimizer(1.0)) + train_op = optimizer.minimize_constrained(minimization_problem) + + # Calculated using a numpy+python implementation of the algorithm. + expected_matrices = [ + np.array([[1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]), + np.array([[0.66666667, 1.0, 1.0, 1.0], [0.26666667, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], [0.06666667, 0.0, 0.0, 0.0]]), + np.array([[0.41666667, 0.93333333, 1.0, + 0.98333333], [0.46666667, 0.05333333, 0.0, + 0.01333333], [0.0, 0.0, 0.0, 0.0], + [0.11666667, 0.01333333, 0.0, 0.00333333]]), + ] + + matrices = [] + with self.test_session() as session: + session.run(standard_ops.global_variables_initializer()) + while len(matrices) < len(expected_matrices): + matrices.append(session.run(optimizer.stochastic_matrix)) + session.run(train_op) + + for expected, actual in zip(expected_matrices, matrices): + self.assertAllClose(expected, actual, rtol=0, atol=1e-6) + + def test_multiplicative_swap_regret_optimizer(self): + """Tests that the stochastic matrices update as expected.""" + minimization_problem = test_util.ConstantMinimizationProblem( + np.array([0.6, -0.1, 0.4])) + optimizer = MultiplicativeSwapRegretOptimizerWrapper( + gradient_descent.GradientDescentOptimizer(1.0), + initial_multiplier_radius=0.8) + train_op = optimizer.minimize_constrained(minimization_problem) + + # Calculated using a numpy+python implementation of the algorithm. + expected_matrices = [ + np.array([[0.4, 0.4, 0.4, 0.4], [0.2, 0.2, 0.2, 0.2], + [0.2, 0.2, 0.2, 0.2], [0.2, 0.2, 0.2, 0.2]]), + np.array([[0.36999014, 0.38528351, 0.38528351, 0.38528351], [ + 0.23517483, 0.21720297, 0.21720297, 0.21720297 + ], [0.17774131, 0.18882719, 0.18882719, 0.18882719], + [0.21709373, 0.20868632, 0.20868632, 0.20868632]]), + np.array([[0.33972109, 0.36811863, 0.37118462, 0.36906575], [ + 0.27114826, 0.23738228, 0.23376693, 0.23626491 + ], [0.15712313, 0.17641793, 0.17858959, 0.17708679], + [0.23200752, 0.21808115, 0.21645886, 0.21758255]]), + ] + + matrices = [] + with self.test_session() as session: + session.run(standard_ops.global_variables_initializer()) + while len(matrices) < len(expected_matrices): + matrices.append(session.run(optimizer.stochastic_matrix)) + session.run(train_op) + + for expected, actual in zip(expected_matrices, matrices): + self.assertAllClose(expected, actual, rtol=0, atol=1e-6) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/test_util.py b/tensorflow/contrib/constrained_optimization/python/test_util.py new file mode 100644 index 0000000000000000000000000000000000000000..704b36ca4c9cf94e7c304f9bed4f6ac7ca275deb --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/test_util.py @@ -0,0 +1,58 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains helpers used by tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.constrained_optimization.python import constrained_minimization_problem + +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import standard_ops + + +class ConstantMinimizationProblem( + constrained_minimization_problem.ConstrainedMinimizationProblem): + """A `ConstrainedMinimizationProblem` with constant constraint violations. + + This minimization problem is intended for use in performing simple tests of + the Lagrange multiplier (or equivalent) update in the optimizers. There is a + one-element "dummy" model parameter, but it should be ignored. + """ + + def __init__(self, constraints): + """Constructs a new `ConstantMinimizationProblem'. + + Args: + constraints: 1d numpy array, the constant constraint violations. + + Returns: + A new `ConstantMinimizationProblem'. + """ + # We make an fake 1-parameter linear objective so that we don't get a "no + # variables to optimize" error. + self._objective = standard_ops.Variable(0.0, dtype=dtypes.float32) + self._constraints = standard_ops.constant(constraints, dtype=dtypes.float32) + + @property + def objective(self): + """Returns the objective function.""" + return self._objective + + @property + def constraints(self): + """Returns the constant constraint violations.""" + return self._constraints diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD index d68015ae1565b778b1ba0744f515d09007175e93..aeefa3cee62281c74388765ea5e2cbc7f16ff927 100644 --- a/tensorflow/contrib/cudnn_rnn/BUILD +++ b/tensorflow/contrib/cudnn_rnn/BUILD @@ -25,7 +25,7 @@ tf_custom_op_py_library( srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - "//tensorflow/contrib/eager/python:checkpointable_utils", + "//tensorflow/contrib/checkpoint/python:split_dependency", "//tensorflow/contrib/util:util_py", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py index 6fb56b0858786662546ecab425b1a2564fbd9a64..012b17cee88aecb44c45dd44ea72fd43b5494085 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py @@ -1072,6 +1072,17 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase): class CudnnRNNTestTraining(test_util.TensorFlowTestCase): + def setUp(self): + super(CudnnRNNTestTraining, self).setUp() + self._reset_rnd_gen_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", + str(False)) + self._rnn_use_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") + + def tearDown(self): + super(CudnnRNNTestTraining, self).tearDown() + os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = self._reset_rnd_gen_state + os.environ["TF_CUDNN_RNN_USE_V2"] = self._rnn_use_v2 + def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1): """Compute the numeric gradient of y wrt to x. @@ -1184,11 +1195,10 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, dropout, dtype, - delta, tolerance): + use_v2, delta, tolerance): # Gradient checking runs two forward ops with almost the same input. Need to # make sure the drop patterns across the two runs are the same. logging.info("Training test with config: %s", locals()) - old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False)) os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True) np.random.seed(1234) @@ -1196,6 +1206,10 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): has_input_c = (rnn_mode == CUDNN_LSTM) direction = (CUDNN_RNN_UNIDIRECTION if dir_count == 1 else CUDNN_RNN_BIDIRECTION) + if use_v2: + os.environ["TF_CUDNN_RNN_USE_V2"] = "1" + else: + os.environ["TF_CUDNN_RNN_USE_V2"] = "0" model = CudnnTestModel( rnn_mode, num_layers, @@ -1245,22 +1259,22 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): self._GradientCheck( sess, total_sum, all_inputs, tolerance=tolerance, delta=delta) - os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state def _TestSimpleTrainingHelper(self, rnn_mode, test_configs): dropouts = [0, 0.5, 1.] - for config, dropout in itertools.product(test_configs, dropouts): + v2_options = [str(False), str(True)] + for config, dropout, use_v2 in itertools.product(test_configs, dropouts, + v2_options): dtype = config.get("dtype", dtypes.float32) delta = config.get("delta", 1e-4) tolerance = config.get("tolerance", 1e-6) dir_count = config.get("dir_count", 1) shape = config["shape"] with ops.Graph().as_default(): - self._TestOneSimpleTraining(rnn_mode, shape["num_layers"], - shape["num_units"], shape["input_size"], - shape["batch_size"], shape["seq_length"], - dir_count, dropout, dtype, delta, - tolerance) + self._TestOneSimpleTraining( + rnn_mode, shape["num_layers"], shape["num_units"], + shape["input_size"], shape["batch_size"], shape["seq_length"], + dir_count, dropout, dtype, use_v2, delta, tolerance) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index b615824460be59697d2207f5d5af0eba748c5237..73a961992e19fabec5d0f75be1b52dbba20eb7af 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -17,7 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.eager.python import checkpointable_utils +import os +from tensorflow.contrib.checkpoint.python import split_dependency from tensorflow.contrib.rnn.python.ops import lstm_ops from tensorflow.python.framework import common_shapes from tensorflow.python.framework import dtypes @@ -318,7 +319,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): dependencies too (typically the cuDNN `Layer`). dtype: The dtype for the canonical parameter Tensors. """ - split_dependencies = checkpointable_utils.split_dependency( + split_dependencies = split_dependency.split_dependency( component_names=self._param_names, component_dtypes=(dtype,) * len(self._param_names), fill_save_buffer_fn=self._checkpointable_save, @@ -901,19 +902,27 @@ def _cudnn_rnn(inputs, check_direction(direction) check_input_mode(input_mode) seed, seed2 = random_seed.get_seed(seed) - outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn( - input=inputs, - input_h=input_h, - input_c=input_c, - params=params, - is_training=is_training, - rnn_mode=rnn_mode, - input_mode=input_mode, - direction=direction, - dropout=dropout, - seed=seed, - seed2=seed2, - name=name) + # TODO(jamesqin): switch default value to "1" on May 25th 2018, and get rid + # of V1 ops. + use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") + args = { + "input": inputs, + "input_h": input_h, + "input_c": input_c, + "params": params, + "is_training": is_training, + "rnn_mode": rnn_mode, + "input_mode": input_mode, + "direction": direction, + "dropout": dropout, + "seed": seed, + "seed2": seed2, + "name": name + } + if use_cudnn_v2 is not "1": + outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) + else: + outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) return (outputs, output_h, output_c) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index b15b9663f4c1bd48ed2a482f657490b0b5677673..d59dd17aea42618075e69516bcfa4ee2b9eafc81 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -122,7 +122,10 @@ py_test( size = "small", srcs = ["filter_dataset_op_test.py"], srcs_version = "PY2AND3", - tags = ["no_pip"], + tags = [ + "no_pip", + "optonly", + ], deps = [ ":dataset_serialization_test", "//tensorflow/python:array_ops", @@ -211,7 +214,11 @@ py_test( size = "medium", srcs = ["map_dataset_op_test.py"], srcs_version = "PY2AND3", - tags = ["no_pip"], + tags = [ + "no_pip", + "noasan", # times out + "optonly", + ], deps = [ ":dataset_serialization_test", "//tensorflow/contrib/data/python/ops:error_ops", @@ -306,7 +313,10 @@ py_test( srcs = ["resample_test.py"], shard_count = 2, srcs_version = "PY2AND3", - tags = ["noasan"], + tags = [ + "noasan", + "optonly", + ], deps = [ "//tensorflow/contrib/data/python/ops:resampling", "//tensorflow/python:client_testlib", @@ -333,6 +343,7 @@ py_test( "//tensorflow/python:dtypes", "//tensorflow/python:errors", "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/eager:context", "//third_party/py/numpy", ], ) @@ -506,3 +517,23 @@ tf_py_test( "//third_party/py/numpy", ], ) + +tf_py_test( + name = "writer_ops_test", + size = "small", + srcs = ["writer_ops_test.py"], + additional_deps = [ + "//tensorflow/contrib/data/python/ops:writers", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dataset_ops_gen", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:io_ops", + "//tensorflow/python:lib", + "//tensorflow/python:tensor_shape", + "//tensorflow/python:util", + "//tensorflow/python/data/ops:readers", + ], +) diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index 413d8737978b695ac443c92036d6641e5c73f28c..a4a0ce79b6013d8813f2d8d294168ea8189d53ef 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -18,15 +18,18 @@ from __future__ import division from __future__ import print_function import math +import time import numpy as np from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base from tensorflow.contrib.data.python.ops import batching +from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -34,6 +37,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops from tensorflow.python.ops import string_ops from tensorflow.python.platform import test +from tensorflow.python.util import compat class BatchDatasetTest(test.TestCase): @@ -151,6 +155,69 @@ class BatchDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(op) + def testUnbatchDatasetWithStrings(self): + data = tuple([math_ops.range(10) for _ in range(3)]) + data = dataset_ops.Dataset.from_tensor_slices(data) + data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z)) + expected_types = (dtypes.int32, dtypes.string, dtypes.int32) + data = data.batch(2) + self.assertEqual(expected_types, data.output_types) + data = data.apply(batching.unbatch()) + self.assertEqual(expected_types, data.output_types) + + iterator = data.make_one_shot_iterator() + op = iterator.get_next() + + with self.test_session() as sess: + for i in range(10): + self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op)) + + with self.assertRaises(errors.OutOfRangeError): + sess.run(op) + + def testUnbatchDatasetWithSparseTensor(self): + st = sparse_tensor.SparseTensorValue( + indices=[[i, i] for i in range(10)], + values=list(range(10)), + dense_shape=[10, 10]) + data = dataset_ops.Dataset.from_tensors(st) + data = data.apply(batching.unbatch()) + data = data.batch(5) + data = data.apply(batching.unbatch()) + iterator = data.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + for i in range(10): + st_row = sess.run(next_element) + self.assertEqual([i], st_row.indices) + self.assertEqual([i], st_row.values) + self.assertEqual([10], st_row.dense_shape) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + def testUnbatchDatasetWithDenseAndSparseTensor(self): + st = sparse_tensor.SparseTensorValue( + indices=[[i, i] for i in range(10)], + values=list(range(10)), + dense_shape=[10, 10]) + data = dataset_ops.Dataset.from_tensors((list(range(10)), st)) + data = data.apply(batching.unbatch()) + data = data.batch(5) + data = data.apply(batching.unbatch()) + iterator = data.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + for i in range(10): + dense_elem, st_row = sess.run(next_element) + self.assertEqual(i, dense_elem) + self.assertEqual([i], st_row.indices) + self.assertEqual([i], st_row.values) + self.assertEqual([10], st_row.dense_shape) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + def testUnbatchSingleElementTupleDataset(self): data = tuple([(math_ops.range(10),) for _ in range(3)]) data = dataset_ops.Dataset.from_tensor_slices(data) @@ -191,6 +258,53 @@ class BatchDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(op) + def testUnbatchEmpty(self): + data = dataset_ops.Dataset.from_tensors( + (constant_op.constant([]), constant_op.constant([], shape=[0, 4]), + constant_op.constant([], shape=[0, 4, 0]))) + data = data.apply(batching.unbatch()) + iterator = data.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + def testUnbatchStaticShapeMismatch(self): + data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8), + np.arange(9))) + with self.assertRaises(ValueError): + data.apply(batching.unbatch()) + + def testUnbatchDynamicShapeMismatch(self): + ph1 = array_ops.placeholder(dtypes.int32, shape=[None]) + ph2 = array_ops.placeholder(dtypes.int32, shape=None) + data = dataset_ops.Dataset.from_tensors((ph1, ph2)) + data = data.apply(batching.unbatch()) + iterator = data.make_initializable_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + # Mismatch in the 0th dimension. + sess.run( + iterator.initializer, + feed_dict={ + ph1: np.arange(7).astype(np.int32), + ph2: np.arange(8).astype(np.int32) + }) + with self.assertRaises(errors.InvalidArgumentError): + print(sess.run(next_element)) + + # No 0th dimension (i.e. scalar value) for one component. + sess.run( + iterator.initializer, + feed_dict={ + ph1: np.arange(7).astype(np.int32), + ph2: 7 + }) + with self.assertRaises(errors.InvalidArgumentError): + print(sess.run(next_element)) + def testBatchAndDropRemainder(self): components = (np.arange(7), np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis], @@ -545,6 +659,59 @@ class BatchDatasetSerializationTest( self.run_core_tests(self._build_dataset_nested_sparse, None, 1) +class UnbatchDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def build_dataset(self, multiplier=15.0, tensor_slice_len=2, batch_size=2): + components = ( + np.arange(tensor_slice_len), + np.array([[1, 2, 3]]) * np.arange(tensor_slice_len)[:, np.newaxis], + np.array(multiplier) * np.arange(tensor_slice_len)) + + return dataset_ops.Dataset.from_tensor_slices(components).batch( + batch_size).apply(batching.unbatch()) + + def testCore(self): + tensor_slice_len = 8 + batch_size = 2 + num_outputs = tensor_slice_len + self.run_core_tests( + lambda: self.build_dataset(15.0, tensor_slice_len, batch_size), + lambda: self.build_dataset(20.0, tensor_slice_len, batch_size), + num_outputs) + + +class MapAndBatchDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def testSerializationCore(self): + range_size = 11 + num_repeats = 2 + batch_size = 5 + total_outputs = range_size * num_repeats + num_outputs_drop_remainder = total_outputs // batch_size + num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size)) + num_parallel_batches = 2 + + def build_ds(range_start, drop_remainder=False): + + def _map_fn(x): + return math_ops.square(x) + + return dataset_ops.Dataset.range( + range_start, range_start + range_size).repeat(num_repeats).apply( + batching.map_and_batch( + map_func=_map_fn, + batch_size=batch_size, + num_parallel_batches=num_parallel_batches, + drop_remainder=drop_remainder)) + + self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15), + num_outputs_keep_remainder) + self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True), + num_outputs_drop_remainder) + + class PaddedBatchDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): @@ -586,10 +753,12 @@ class RestructuredDatasetTest(test.TestCase): def test_assert_element_shape(self): def create_unknown_shape_dataset(x): - return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), - [x], - [dtypes.float32, dtypes.int32]) + return script_ops.py_func( + lambda _: ( # pylint: disable=g-long-lambda + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), + [x], + [dtypes.float32, dtypes.int32]) dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset) unknown_shapes = (tensor_shape.TensorShape(None), @@ -626,10 +795,12 @@ class RestructuredDatasetTest(test.TestCase): def test_assert_wrong_element_shape_on_unknown_shape_dataset(self): def create_unknown_shape_dataset(x): - return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32), - np.zeros((3, 4), dtype=np.int32)), - [x], - [dtypes.float32, dtypes.int32]) + return script_ops.py_func( + lambda _: ( # pylint: disable=g-long-lambda + np.ones(2, dtype=np.float32), + np.zeros((3, 4), dtype=np.int32)), + [x], + [dtypes.float32, dtypes.int32]) dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset) unknown_shapes = (tensor_shape.TensorShape(None), @@ -649,5 +820,77 @@ class RestructuredDatasetTest(test.TestCase): sess.run(get_next) +class UnbatchDatasetBenchmark(test.Benchmark): + + def benchmarkNativeUnbatch(self): + batch_sizes = [1, 2, 5, 10, 20, 50] + elems_per_trial = 10000 + with ops.Graph().as_default(): + dataset = dataset_ops.Dataset.from_tensors("element").repeat(None) + batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[]) + dataset = dataset.batch(batch_size_placeholder) + dataset = dataset.apply(batching.unbatch()) + dataset = dataset.skip(elems_per_trial) + iterator = dataset.make_initializable_iterator() + next_element = iterator.get_next() + + with session.Session() as sess: + for batch_size in batch_sizes: + deltas = [] + for _ in range(5): + sess.run( + iterator.initializer, + feed_dict={batch_size_placeholder: batch_size}) + start = time.time() + sess.run(next_element.op) + end = time.time() + deltas.append((end - start) / elems_per_trial) + + median_wall_time = np.median(deltas) + print("Unbatch (native) batch size: %d Median wall time per element:" + " %f microseconds" % (batch_size, median_wall_time * 1e6)) + self.report_benchmark( + iters=10000, + wall_time=median_wall_time, + name="benchmark_unbatch_dataset_native_batch_size_%d" % + batch_size) + + # Include a benchmark of the previous `unbatch()` implementation that uses + # a composition of more primitive ops. Eventually we'd hope to generate code + # that is as good in both cases. + def benchmarkOldUnbatchImplementation(self): + batch_sizes = [1, 2, 5, 10, 20, 50] + elems_per_trial = 10000 + with ops.Graph().as_default(): + dataset = dataset_ops.Dataset.from_tensors("element").repeat(None) + batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[]) + dataset = dataset.batch(batch_size_placeholder) + dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices) + dataset = dataset.skip(elems_per_trial) + iterator = dataset.make_initializable_iterator() + next_element = iterator.get_next() + + with session.Session() as sess: + for batch_size in batch_sizes: + deltas = [] + for _ in range(5): + sess.run( + iterator.initializer, + feed_dict={batch_size_placeholder: batch_size}) + start = time.time() + sess.run(next_element.op) + end = time.time() + deltas.append((end - start) / elems_per_trial) + + median_wall_time = np.median(deltas) + print("Unbatch (unfused) batch size: %d Median wall time per element:" + " %f microseconds" % (batch_size, median_wall_time * 1e6)) + self.report_benchmark( + iters=10000, + wall_time=median_wall_time, + name="benchmark_unbatch_dataset_unfused_batch_size_%d" % + batch_size) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py index 6002cc73c8b41c2f20beaf0158af813807e58c90..55a56b83a8efba899c6b296264d766839a824da5 100644 --- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py @@ -61,7 +61,7 @@ class GroupByWindowTest(test.TestCase): self.assertEqual(len(components), sum(counts)) num_full_batches = len([c for c in counts if c == 4]) - self.assertGreaterEqual(num_full_batches, 23) + self.assertGreaterEqual(num_full_batches, 24) self.assertTrue(all(c == 4 for c in counts[:num_full_batches])) def testImmediateOutput(self): diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py index e0494736b72ae52f586cb80d42a5c1e50ac17a61..1a97a84b2cba13e82c8af9c4c8ee413ee8264a5e 100644 --- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py @@ -24,9 +24,11 @@ import numpy as np from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base from tensorflow.contrib.data.python.ops import scan_ops from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -57,19 +59,24 @@ class ScanDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + @test_util.run_in_graph_and_eager_modes() def testFibonacci(self): iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply( scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1])) ).make_one_shot_iterator() - next_element = iterator.get_next() - with self.test_session() as sess: - self.assertEqual(1, sess.run(next_element)) - self.assertEqual(1, sess.run(next_element)) - self.assertEqual(2, sess.run(next_element)) - self.assertEqual(3, sess.run(next_element)) - self.assertEqual(5, sess.run(next_element)) - self.assertEqual(8, sess.run(next_element)) + if context.executing_eagerly(): + next_element = iterator.get_next + else: + get_next = iterator.get_next() + next_element = lambda: get_next + + self.assertEqual(1, self.evaluate(next_element())) + self.assertEqual(1, self.evaluate(next_element())) + self.assertEqual(2, self.evaluate(next_element())) + self.assertEqual(3, self.evaluate(next_element())) + self.assertEqual(5, self.evaluate(next_element())) + self.assertEqual(8, self.evaluate(next_element())) def testChangingStateShape(self): # Test the fixed-point shape invariant calculations: start with diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py index c3a7f291c59a72dc6057f7e1c51d5ac78334176b..5c74ed6ae7210e8e22efb6e8fdb773397459ce1e 100644 --- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py @@ -50,17 +50,17 @@ class StatsDatasetTest(test.TestCase): self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto)) def testBytesProduced(self): + stats_aggregator = stats_ops.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply( - stats_ops.bytes_produced_stats("bytes_produced")) + stats_ops.bytes_produced_stats("bytes_produced")).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) iterator = dataset.make_initializable_iterator() - stats_aggregator = stats_ops.StatsAggregator() - stats_aggregator_subscriber = stats_aggregator.subscribe(iterator) next_element = iterator.get_next() summary_t = stats_aggregator.get_summary() with self.test_session() as sess: - sess.run([iterator.initializer, stats_aggregator_subscriber]) + sess.run(iterator.initializer) expected_sum = 0.0 for i in range(100): self.assertAllEqual( @@ -76,16 +76,16 @@ class StatsDatasetTest(test.TestCase): self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum) def testLatencyStats(self): + stats_aggregator = stats_ops.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")) + stats_ops.latency_stats("record_latency")).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) iterator = dataset.make_initializable_iterator() - stats_aggregator = stats_ops.StatsAggregator() - stats_aggregator_subscriber = stats_aggregator.subscribe(iterator) next_element = iterator.get_next() summary_t = stats_aggregator.get_summary() with self.test_session() as sess: - sess.run([iterator.initializer, stats_aggregator_subscriber]) + sess.run(iterator.initializer) for i in range(100): self.assertEqual(i, sess.run(next_element)) self._assertSummaryHasCount( @@ -95,16 +95,15 @@ class StatsDatasetTest(test.TestCase): self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0) def testReinitialize(self): + stats_aggregator = stats_ops.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")) + stats_ops.latency_stats("record_latency")).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) iterator = dataset.make_initializable_iterator() - stats_aggregator = stats_ops.StatsAggregator() - stats_aggregator_subscriber = stats_aggregator.subscribe(iterator) next_element = iterator.get_next() summary_t = stats_aggregator.get_summary() with self.test_session() as sess: - sess.run(stats_aggregator_subscriber) for j in range(5): sess.run(iterator.initializer) for i in range(100): @@ -130,17 +129,17 @@ class StatsDatasetTest(test.TestCase): sess.run(next_element) def testMultipleTags(self): + stats_aggregator = stats_ops.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")).apply( - stats_ops.latency_stats("record_latency_2")) + stats_ops.latency_stats("record_latency_2")).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) iterator = dataset.make_initializable_iterator() - stats_aggregator = stats_ops.StatsAggregator() - stats_aggregator_subscriber = stats_aggregator.subscribe(iterator) next_element = iterator.get_next() summary_t = stats_aggregator.get_summary() with self.test_session() as sess: - sess.run([iterator.initializer, stats_aggregator_subscriber]) + sess.run(iterator.initializer) for i in range(100): self.assertEqual(i, sess.run(next_element)) self._assertSummaryHasCount( @@ -154,17 +153,17 @@ class StatsDatasetTest(test.TestCase): sess.run(summary_t), "record_latency_2", 100.0) def testRepeatedTags(self): + stats_aggregator = stats_ops.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")).apply( - stats_ops.latency_stats("record_latency")) + stats_ops.latency_stats("record_latency")).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) iterator = dataset.make_initializable_iterator() - stats_aggregator = stats_ops.StatsAggregator() - stats_aggregator_subscriber = stats_aggregator.subscribe(iterator) next_element = iterator.get_next() summary_t = stats_aggregator.get_summary() with self.test_session() as sess: - sess.run([iterator.initializer, stats_aggregator_subscriber]) + sess.run(iterator.initializer) for i in range(100): self.assertEqual(i, sess.run(next_element)) self._assertSummaryHasCount( @@ -174,19 +173,17 @@ class StatsDatasetTest(test.TestCase): self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0) def testMultipleIteratorsSameAggregator(self): + stats_aggregator = stats_ops.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")) + stats_ops.latency_stats("record_latency")).apply( + stats_ops.set_stats_aggregator(stats_aggregator)) iterator_0 = dataset.make_initializable_iterator() iterator_1 = dataset.make_initializable_iterator() - stats_aggregator = stats_ops.StatsAggregator() - stats_aggregator_subscribers = [stats_aggregator.subscribe(iterator_0), - stats_aggregator.subscribe(iterator_1)] next_element = iterator_0.get_next() + iterator_1.get_next() summary_t = stats_aggregator.get_summary() with self.test_session() as sess: - sess.run([iterator_0.initializer, iterator_1.initializer, - stats_aggregator_subscribers]) + sess.run([iterator_0.initializer, iterator_1.initializer]) for i in range(100): self.assertEqual(i * 2, sess.run(next_element)) self._assertSummaryHasCount( @@ -195,20 +192,6 @@ class StatsDatasetTest(test.TestCase): sess.run(next_element) self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0) - def testMultipleStatsAggregatorsSameIteratorFail(self): - dataset = dataset_ops.Dataset.range(100).apply( - stats_ops.latency_stats("record_latency")) - iterator = dataset.make_initializable_iterator() - stats_aggregator_0 = stats_ops.StatsAggregator() - stats_aggregator_1 = stats_ops.StatsAggregator() - - with self.test_session() as sess: - sess.run(stats_aggregator_0.subscribe(iterator)) - # TODO(mrry): Consider making this allowable (and also allowing - # aggregators to unsubscribe). - with self.assertRaises(errors.FailedPreconditionError): - sess.run(stats_aggregator_1.subscribe(iterator)) - class StatsDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): @@ -269,5 +252,9 @@ class StatsDatasetSerializationTest( None, num_outputs) +# TODO(shivaniagrawal): Can not checkpoint input_pipeline with the +# transformation `stats_ops.set_stats_aggregator`, since we don't support +# serializing StatsAggregator yet. + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py new file mode 100644 index 0000000000000000000000000000000000000000..c603ecc5ab27a711557376246b093fd5f80f8aec --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/writer_ops_test.py @@ -0,0 +1,117 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for the experimental input pipeline ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.contrib.data.python.ops import writers +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import readers +from tensorflow.python.framework import dtypes +from tensorflow.python.lib.io import python_io +from tensorflow.python.lib.io import tf_record +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test +from tensorflow.python.util import compat + + +class TFRecordWriterTest(test.TestCase): + + def setUp(self): + super(TFRecordWriterTest, self).setUp() + self._num_records = 7 + self.filename = array_ops.placeholder(dtypes.string, shape=[]) + self.compression_type = array_ops.placeholder_with_default("", shape=[]) + + input_dataset = readers.TFRecordDataset([self.filename], + self.compression_type) + self.writer = writers.TFRecordWriter( + self._outputFilename(), self.compression_type).write(input_dataset) + + def _record(self, i): + return compat.as_bytes("Record %d" % (i)) + + def _createFile(self, options=None): + filename = self._inputFilename() + writer = python_io.TFRecordWriter(filename, options) + for i in range(self._num_records): + writer.write(self._record(i)) + writer.close() + return filename + + def _inputFilename(self): + return os.path.join(self.get_temp_dir(), "tf_record.in.txt") + + def _outputFilename(self): + return os.path.join(self.get_temp_dir(), "tf_record.out.txt") + + def testWrite(self): + with self.test_session() as sess: + sess.run( + self.writer, feed_dict={ + self.filename: self._createFile(), + }) + for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): + self.assertAllEqual(self._record(i), r) + + def testWriteZLIB(self): + options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB) + with self.test_session() as sess: + sess.run( + self.writer, + feed_dict={ + self.filename: self._createFile(options), + self.compression_type: "ZLIB", + }) + for i, r in enumerate( + tf_record.tf_record_iterator(self._outputFilename(), options=options)): + self.assertAllEqual(self._record(i), r) + + def testWriteGZIP(self): + options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP) + with self.test_session() as sess: + sess.run( + self.writer, + feed_dict={ + self.filename: self._createFile(options), + self.compression_type: "GZIP", + }) + for i, r in enumerate( + tf_record.tf_record_iterator(self._outputFilename(), options=options)): + self.assertAllEqual(self._record(i), r) + + def testFailDataset(self): + with self.assertRaises(TypeError): + writers.TFRecordWriter(self._outputFilename(), + self.compression_type).write("whoops") + + def testFailDType(self): + input_dataset = dataset_ops.Dataset.from_tensors(10) + with self.assertRaises(TypeError): + writers.TFRecordWriter(self._outputFilename(), + self.compression_type).write(input_dataset) + + def testFailShape(self): + input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]]) + with self.assertRaises(TypeError): + writers.TFRecordWriter(self._outputFilename(), + self.compression_type).write(input_dataset) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index e00f2304cc415e45b76f4f53c51bca1cfe4ac114..5b04c5316cfbb7577b3f8b3b6d364fc665d14c21 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -280,6 +280,18 @@ py_library( ], ) +py_library( + name = "writers", + srcs = [ + "writers.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:dtypes", + "//tensorflow/python/data/ops:dataset_ops", + ], +) + tf_gen_op_wrapper_py( name = "gen_dataset_ops", out = "gen_dataset_ops.py", @@ -342,6 +354,7 @@ py_library( ":stats_ops", ":threadpool", ":unique", + ":writers", "//tensorflow/python:dataset_ops_gen", "//tensorflow/python:util", "//tensorflow/python/data/ops:dataset_ops", diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 28db949da9e371bfa27fa847faee88f0366699ba..2152bcde84aae6b0c2b368e43750aafab3a04bf2 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -80,28 +80,98 @@ def dense_to_sparse_batch(batch_size, row_shape): return _apply_fn +class UnbatchDataset(dataset_ops.Dataset): + """A dataset that splits the elements of its input into multiple elements.""" + + def __init__(self, input_dataset): + """See `unbatch()` for more details.""" + super(UnbatchDataset, self).__init__() + flat_shapes = nest.flatten(input_dataset.output_shapes) + if any(s.ndims == 0 for s in flat_shapes): + raise ValueError("Cannot unbatch an input with scalar components.") + known_batch_dim = tensor_shape.Dimension(None) + for s in flat_shapes: + try: + known_batch_dim = known_batch_dim.merge_with(s[0]) + except ValueError: + raise ValueError("Cannot unbatch an input whose components have " + "different batch sizes.") + self._input_dataset = input_dataset + + def _as_variant_tensor(self): + return gen_dataset_ops.unbatch_dataset( + self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes)), + output_types=nest.flatten( + sparse.as_dense_types(self.output_types, self.output_classes))) + + @property + def output_classes(self): + return self._input_dataset.output_classes + + @property + def output_shapes(self): + return nest.map_structure(lambda s: s[1:], + self._input_dataset.output_shapes) + + @property + def output_types(self): + return self._input_dataset.output_types + + def unbatch(): - """A Transformation which splits the elements of a dataset. + """Splits elements of a dataset into multiple elements on the batch dimension. For example, if elements of the dataset are shaped `[B, a0, a1, ...]`, - where `B` may vary from element to element, then for each element in - the dataset, the unbatched dataset will contain `B` consecutive elements + where `B` may vary for each input element, then for each element in the + dataset, the unbatched dataset will contain `B` consecutive elements of shape `[a0, a1, ...]`. + ```python + # NOTE: The following example uses `{ ... }` to represent the contents + # of a dataset. + a = { ['a', 'b', 'c'], ['a', 'b'], ['a', 'b', 'c', 'd'] } + + a.apply(tf.contrib.data.unbatch()) == { + 'a', 'b', 'c', 'a', 'b', 'a', 'b', 'c', 'd'} + ``` + Returns: A `Dataset` transformation function, which can be passed to @{tf.data.Dataset.apply}. """ def _apply_fn(dataset): - - def unbatch_map(arg, *rest): + """Function from `Dataset` to `Dataset` that applies the transformation.""" + if not sparse.any_sparse(dataset.output_classes): + return UnbatchDataset(dataset) + + # NOTE(mrry): We must ensure that any SparseTensors in `dataset` + # are normalized to the rank-1 dense representation, so that the + # sparse-oblivious unbatching logic will slice them + # appropriately. This leads to a somewhat inefficient re-encoding step + # for all SparseTensor components. + # TODO(mrry): Consider optimizing this in future + # if it turns out to be a bottleneck. + def normalize(arg, *rest): if rest: - return dataset_ops.Dataset.from_tensor_slices((arg,) + rest) + return sparse.serialize_many_sparse_tensors((arg,) + rest) else: - return dataset_ops.Dataset.from_tensor_slices(arg) + return sparse.serialize_many_sparse_tensors(arg) + + normalized_dataset = dataset.map(normalize) - return dataset.flat_map(map_func=unbatch_map) + # NOTE(mrry): Our `map()` has lost information about the sparseness + # of any SparseTensor components, so re-apply the structure of the + # original dataset. + restructured_dataset = _RestructuredDataset( + normalized_dataset, + dataset.output_types, + dataset.output_shapes, + dataset.output_classes, + allow_unsafe_cast=True) + return UnbatchDataset(restructured_dataset) return _apply_fn @@ -265,7 +335,8 @@ class _RestructuredDataset(dataset_ops.Dataset): dataset, output_types, output_shapes=None, - output_classes=None): + output_classes=None, + allow_unsafe_cast=False): """Creates a new dataset with the given output types and shapes. The given `dataset` must have a structure that is convertible: @@ -283,6 +354,10 @@ class _RestructuredDataset(dataset_ops.Dataset): If omitted, the shapes will be inherited from `dataset`. output_classes: (Optional.) A nested structure of class types. If omitted, the class types will be inherited from `dataset`. + allow_unsafe_cast: (Optional.) If `True`, the caller may switch the + reported output types and shapes of the restructured dataset, e.g. to + switch a sparse tensor represented as `tf.variant` to its user-visible + type and shape. Raises: ValueError: If either `output_types` or `output_shapes` is not compatible @@ -291,14 +366,15 @@ class _RestructuredDataset(dataset_ops.Dataset): super(_RestructuredDataset, self).__init__() self._dataset = dataset - # Validate that the types are compatible. - output_types = nest.map_structure(dtypes.as_dtype, output_types) - flat_original_types = nest.flatten(dataset.output_types) - flat_new_types = nest.flatten(output_types) - if flat_original_types != flat_new_types: - raise ValueError( - "Dataset with output types %r cannot be restructured to have output " - "types %r" % (dataset.output_types, output_types)) + if not allow_unsafe_cast: + # Validate that the types are compatible. + output_types = nest.map_structure(dtypes.as_dtype, output_types) + flat_original_types = nest.flatten(dataset.output_types) + flat_new_types = nest.flatten(output_types) + if flat_original_types != flat_new_types: + raise ValueError( + "Dataset with output types %r cannot be restructured to have " + "output types %r" % (dataset.output_types, output_types)) self._output_types = output_types @@ -308,18 +384,19 @@ class _RestructuredDataset(dataset_ops.Dataset): nest.flatten( dataset.output_shapes)) else: - # Validate that the shapes are compatible. - nest.assert_same_structure(output_types, output_shapes) - flat_original_shapes = nest.flatten(dataset.output_shapes) - flat_new_shapes = nest.flatten_up_to(output_types, output_shapes) - - for original_shape, new_shape in zip(flat_original_shapes, - flat_new_shapes): - if not original_shape.is_compatible_with(new_shape): - raise ValueError( - "Dataset with output shapes %r cannot be restructured to have " - "incompatible output shapes %r" % (dataset.output_shapes, - output_shapes)) + if not allow_unsafe_cast: + # Validate that the shapes are compatible. + nest.assert_same_structure(output_types, output_shapes) + flat_original_shapes = nest.flatten(dataset.output_shapes) + flat_new_shapes = nest.flatten_up_to(output_types, output_shapes) + + for original_shape, new_shape in zip(flat_original_shapes, + flat_new_shapes): + if not original_shape.is_compatible_with(new_shape): + raise ValueError( + "Dataset with output shapes %r cannot be restructured to have " + "incompatible output shapes %r" % (dataset.output_shapes, + output_shapes)) self._output_shapes = nest.map_structure_up_to( output_types, tensor_shape.as_shape, output_shapes) if output_classes is None: diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py index 4ec8ae1c79d1eb99c56b31c6a0709a84c38f5f90..bbb808fbd7730002e48cab47fa8d0fe09e2124d2 100644 --- a/tensorflow/contrib/data/python/ops/readers.py +++ b/tensorflow/contrib/data/python/ops/readers.py @@ -156,12 +156,21 @@ def _infer_column_names(filenames, field_delim, use_quote_delim): "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE } with file_io.FileIO(filenames[0], "r") as f: - column_names = next(csv.reader(f, **csv_kwargs)) + try: + column_names = next(csv.reader(f, **csv_kwargs)) + except StopIteration: + raise ValueError(("Received StopIteration when reading the header line " + "of %s. Empty file?") % filenames[0]) for name in filenames[1:]: with file_io.FileIO(name, "r") as f: - if next(csv.reader(f, **csv_kwargs)) != column_names: - raise ValueError("Files have different column names in the header row.") + try: + if next(csv.reader(f, **csv_kwargs)) != column_names: + raise ValueError( + "Files have different column names in the header row.") + except StopIteration: + raise ValueError(("Received StopIteration when reading the header line " + "of %s. Empty file?") % filenames[0]) return column_names diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py index b465397437adbdfaf865efb8ed2f80e57f48fcab..a182dddd38d23d096979eebb8de29f07573833dd 100644 --- a/tensorflow/contrib/data/python/ops/resampling.py +++ b/tensorflow/contrib/data/python/ops/resampling.py @@ -110,7 +110,6 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None): .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p)) return filtered_ds.map(lambda class_value, _, data: (class_value, data)) - return _apply_fn diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py index fe49ee8b1946c03dca37c2bc17dde71646b85bbd..60ef7efba4bb2bc281bc624ec3f58117ffa9a824 100644 --- a/tensorflow/contrib/data/python/ops/scan_ops.py +++ b/tensorflow/contrib/data/python/ops/scan_ops.py @@ -144,6 +144,7 @@ class _ScanDataset(dataset_ops.Dataset): weakened_state_shapes) self._scan_func = tf_scan_func + self._scan_func.add_to_graph(ops.get_default_graph()) def _as_variant_tensor(self): input_t = self._input_dataset._as_variant_tensor() # pylint: disable=protected-access diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py index b5cf0fcfe91ebc22444302fca5d488a278ef2994..d39172039683fe8f333572bb1dbf12abfab65113 100644 --- a/tensorflow/contrib/data/python/ops/stats_ops.py +++ b/tensorflow/contrib/data/python/ops/stats_ops.py @@ -18,7 +18,6 @@ from __future__ import division from __future__ import print_function from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest from tensorflow.python.data.util import sparse from tensorflow.python.framework import dtypes @@ -85,25 +84,53 @@ class StatsAggregator(object): """ return gen_dataset_ops.stats_aggregator_summary(self._resource) - def subscribe(self, iterator): - """Returns a @{tf.Operation} to associate this aggregator with `iterator`. - Note: Each @{tf.data.Iterator} can be associated with at most one - `StatsAggregator`. After running the operation that this function - returns, all statistics recorded in the iteration of `iterator` - will be stored in `stats_aggregator`. +class _SetStatsAggregatorDataset(dataset_ops.Dataset): + """A `Dataset` that acts as an identity, and sets given stats_aggregator.""" - Args: - iterator: A @{tf.data.Iterator} object. + def __init__(self, input_dataset, stats_aggregator): + super(_SetStatsAggregatorDataset, self).__init__() + self._input_dataset = input_dataset + self._stats_aggregator = stats_aggregator - Returns: - A @{tf.Operation} that, when run, associates this aggregator with - `iterator`. - """ - if not isinstance(iterator, iterator_ops.Iterator): - raise TypeError("`iterator` must be a `tf.data.Iterator` object.") - return gen_dataset_ops.iterator_set_stats_aggregator( - iterator._iterator_resource, self._resource) # pylint: disable=protected-access + def _as_variant_tensor(self): + return gen_dataset_ops.set_stats_aggregator_dataset( + self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access + self._stats_aggregator._resource, # pylint: disable=protected-access + output_types=nest.flatten( + sparse.as_dense_types(self.output_types, self.output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes))) + + @property + def output_shapes(self): + return self._input_dataset.output_shapes + + @property + def output_types(self): + return self._input_dataset.output_types + + @property + def output_classes(self): + return self._input_dataset.output_classes + + +# TODO(shivaniagrawal): Expose these methods in `tf.contrib.data`. +def set_stats_aggregator(stats_aggregator): + """Set the given stats_aggregator for aggregating the input dataset stats. + + Args: + stats_aggregator: A `StatsAggregator` object. + + Returns: + A `Dataset` transformation function, which can be passed to + @{tf.data.Dataset.apply}. + """ + + def _apply_fn(dataset): + return _SetStatsAggregatorDataset(dataset, stats_aggregator) + + return _apply_fn def bytes_produced_stats(tag): diff --git a/tensorflow/contrib/data/python/ops/writers.py b/tensorflow/contrib/data/python/ops/writers.py new file mode 100644 index 0000000000000000000000000000000000000000..f53bd3f7383950d6cfdb35e12811fb1daf24b320 --- /dev/null +++ b/tensorflow/contrib/data/python/ops/writers.py @@ -0,0 +1,58 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python wrappers for tf.data writers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import convert +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import gen_dataset_ops + + +class TFRecordWriter(object): + """Writes data to a TFRecord file.""" + + def __init__(self, filename, compression_type=None): + self._filename = ops.convert_to_tensor( + filename, dtypes.string, name="filename") + self._compression_type = convert.optional_param_to_tensor( + "compression_type", + compression_type, + argument_default="", + argument_dtype=dtypes.string) + + def write(self, dataset): + """Returns a @{tf.Operation} to write a dataset to a file. + + Args: + dataset: a @{tf.data.Dataset} whose elements are to be written to a file + + Returns: + A @{tf.Operation} that, when run, writes contents of `dataset` to a file. + """ + if not isinstance(dataset, dataset_ops.Dataset): + raise TypeError("`dataset` must be a `tf.data.Dataset` object.") + if (dataset.output_types != dtypes.string or + dataset.output_shapes != tensor_shape.scalar()): + raise TypeError( + "`dataset` must produce scalar `DT_STRING` tensors whereas it " + "produces shape {0} and types {1}".format(dataset.output_shapes, + dataset.output_types)) + return gen_dataset_ops.dataset_to_tf_record( + dataset._as_variant_tensor(), self._filename, self._compression_type) # pylint: disable=protected-access diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md index 5d22d9aa2bb88337e9b740d297cdf87683bdd578..44a4481021c380e72b535cf0aca39df2bf04d3b7 100644 --- a/tensorflow/contrib/distribute/README.md +++ b/tensorflow/contrib/distribute/README.md @@ -131,8 +131,6 @@ adjusting your learning rate or batch size according to the number of GPUs. We are working on addressing this limitation by splitting each batch across GPUs instead. * PartitionedVariables are not supported yet. -* Input pipelines with Datasets that capture stateful objects and rely on -`make_initializable_iterator` are not supported yet. ## What's next? diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 837a1f134800b0f9faa72f5907d3398964294469..c2834d822664b9d60690c5d5dd527bbbd01a106f 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -231,15 +231,14 @@ py_library( srcs = ["tpu_strategy.py"], visibility = ["//tensorflow:internal"], deps = [ - "//tensorflow/contrib/distribute/python:one_device_strategy", - "//tensorflow/contrib/eager/python:datasets", - "//tensorflow/contrib/optimizer_v2:training", + ":one_device_strategy", + ":values", "//tensorflow/contrib/tpu", - "//tensorflow/python:array_ops", + "//tensorflow/contrib/tpu:tpu_py", + "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python/eager:context", - "@six_archive//:six", + "//tensorflow/python:util", ], ) @@ -249,9 +248,13 @@ py_library( srcs = ["minimize_loss_test.py"], deps = [ ":combinations", + ":mirrored_strategy", ":single_loss_example", + "//tensorflow/contrib/tpu:tpu_lib", "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/eager:context", @@ -324,6 +327,7 @@ py_library( srcs = ["single_loss_example.py"], deps = [ ":step_fn", + "//tensorflow/contrib/data/python/ops:batching", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", "//tensorflow/python:layers", diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py index 1f66997e6eca0b72c2d6eb52ace38cc568ef5d42..946310aa6fc2101d75e86d3ff2e9f3284e6c6625 100644 --- a/tensorflow/contrib/distribute/python/combinations.py +++ b/tensorflow/contrib/distribute/python/combinations.py @@ -266,7 +266,7 @@ one_device_strategy = NamedDistribution( "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"), None) tpu_strategy = NamedDistribution( - "TPU", tpu_strategy.TpuStrategy(), required_tpu=True) + "TPU", tpu_strategy.TPUStrategy(), required_tpu=True) mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1) diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py index c5a520ab5aeafb932092ebbbaaf07480cf40403b..34410a6470185ac2821bc6a59de9230ff478aeb6 100644 --- a/tensorflow/contrib/distribute/python/estimator_integration_test.py +++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py @@ -61,7 +61,8 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, mode=['graph'], distribution=[ combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus ])) def test_complete_flow_with_mode(self, distribution): label_dimension = 2 diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index 4219d54cbd43032e9c5a0ee2e76fe025045ba21b..e134fe34e10be402f028db986b8cbf14222db07f 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -54,30 +54,21 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss, is_tpu): with distribution.scope(): - model_fn, dataset, layer = minimize_loss_example( - optimizer_fn, - use_bias=True, - use_callable_loss=use_callable_loss) + model_fn, dataset_fn, layer = minimize_loss_example( + optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) # TODO(isaprykin): Eliminate `is_tpu`. Probably add a # `DistributionStrategy.create_monitor` so that each DistributionStrategy # could influence its training loop. That method would return an instance # of Monitor. TPUMonitor would execute tpu.initialize_system() and # tpu.shutdown_system(). - if is_tpu: - dataset = dataset.batch(2) - - iterator = distribution.distribute_dataset(dataset) + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() def run_step(): - # TODO(isaprykin): Make iterator get_next() return a list of sub- - # batches for each iteration. Pass iterator.get_next() and not iterator - # to call_for_each_tower. return distribution.group( distribution.call_for_each_tower( - model_fn, - iterator.get_next() if not is_tpu else iterator, - run_concurrently=layer.built)) + model_fn, iterator.get_next(), run_concurrently=layer.built)) if not context.executing_eagerly(): with self.test_session() as sess: @@ -105,8 +96,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.times( combinations.distributions_and_v1_optimizers() + combinations.distributions_and_v2_optimizers(), - combinations.combine(mode=["graph", "eager"]))) - def testOptimizerInsideModelFn(self, distribution, optimizer_fn): + combinations.combine(mode=["graph", "eager"], is_tpu=[False])) + + combinations.combine( + distribution=[combinations.tpu_strategy], + optimizer_fn=[ + combinations.adam_optimizer_v1_fn, + combinations.gradient_descent_optimizer_v1_fn + ], + mode=["graph"], + is_tpu=[True])) + + def testOptimizerInsideModelFn(self, distribution, optimizer_fn, is_tpu): created_variables = [] trainable_variables = [] @@ -121,13 +121,14 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): # `distribution.scope`. with variable_scope.variable_creator_scope( appending_creator), distribution.scope(): - model_fn, dataset, layer = minimize_loss_example( + model_fn, dataset_fn, layer = minimize_loss_example( optimizer_fn, use_bias=True, use_callable_loss=True, create_optimizer_inside_model_fn=True) - iterator = distribution.distribute_dataset(dataset) + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() def run_step(): return distribution.group( @@ -136,11 +137,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): if not context.executing_eagerly(): with self.test_session() as sess: + if is_tpu: + sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) self.evaluate(variables_lib.global_variables_initializer()) run_step() + if is_tpu: + with self.test_session() as sess: + sess.run(tpu.shutdown_system()) + def get_expected_variables(optimizer_fn, num_parameter_devices): variables_map = { "GradientDescent": ["dense/kernel", "dense/bias"], @@ -174,7 +181,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): """Verifies that moving mean updates are reduced across towers.""" with distribution.scope(): num_towers = len(distribution.worker_devices) - model_fn, dataset, batchnorm = batchnorm_example( + model_fn, dataset_fn, batchnorm = batchnorm_example( optimizer_fn, batch_per_epoch=num_towers, momentum=momentum, @@ -185,7 +192,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): # on each device. if isinstance(distribution, mirrored_strategy.MirroredStrategy): distribution._prefetch_on_device = False - iterator = distribution.distribute_dataset(dataset) + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() def run_step(): return control_flow_ops.group( @@ -257,10 +265,13 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): else: return optimizer.minimize(loss_fn()) - features = dataset_ops.Dataset.from_tensors([[2.], [7.]]) - labels = dataset_ops.Dataset.from_tensors([[6.], [21.]]) - dataset = dataset_ops.Dataset.zip((features, labels)).repeat() - iterator = distribution.distribute_dataset(dataset) + def dataset_fn(): + features = dataset_ops.Dataset.from_tensors([[2.], [7.]]) + labels = dataset_ops.Dataset.from_tensors([[6.], [21.]]) + return dataset_ops.Dataset.zip((features, labels)).repeat() + + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() def run_step(): return distribution.group( diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py index eb0edb3a11df7788991ca14f957494d87593a449..6efd578a775da7bf326826289bd5bd50a57be892 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py @@ -140,10 +140,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy): g.add_to_collections(collections, result) return result - def distribute_dataset(self, dataset): - per_device_dataset = values.PerDeviceDataset( - dataset, self._devices, self._prefetch_on_device) - return per_device_dataset.make_one_shot_iterator() + def distribute_dataset(self, dataset_fn): + return values.PerDeviceDataset( + self._call_dataset_fn(dataset_fn), self._devices, + self._prefetch_on_device) def _broadcast(self, tensor, destinations): # TODO(josh11b): In eager mode, use one thread per device, or async mode. diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index 9e9f06da8e2ed185c2c32f79a5a4f5407165fb1d..6c5c055070c0fc88ed8f3a459e3f346596f077a6 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -247,8 +247,9 @@ class MirroredStrategyVariableCreationTest(test.TestCase): dist = mirrored_strategy.MirroredStrategy( ["/device:GPU:0", "/device:CPU:0"]) - features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10) - features = dist.distribute_dataset(features).get_next() + features = dist.distribute_dataset( + lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10) + ).make_one_shot_iterator().get_next() with dist.scope(): result = dist.call_for_each_tower( diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py index 39c49442b9c3245cfd0b67a51be68773a6fd3ff4..646d2a5c3b3b0bfcce6f89be0e588baacc6b9237 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy.py @@ -21,8 +21,6 @@ from __future__ import print_function import six from tensorflow.contrib.distribute.python import values -from tensorflow.contrib.eager.python import datasets -from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -62,11 +60,8 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy): with ops.colocate_with(colocate_with): return next_creator(*args, **kwargs) - def distribute_dataset(self, dataset): - if context.executing_eagerly(): - return datasets.Iterator(dataset) - else: - return dataset.make_one_shot_iterator() + def distribute_dataset(self, dataset_fn): + return self._call_dataset_fn(dataset_fn) def _broadcast(self, tensor, destinations): return tensor diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py index a0912b625f44342d22acc0ce9bb52a6b632c75a0..abd3a65ac4e19ece6b69b9834f4218fde55b60c2 100644 --- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py +++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py @@ -39,10 +39,11 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase): def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss=True): with distribution.scope(): - model_fn, dataset, layer = minimize_loss_example( + model_fn, dataset_fn, layer = minimize_loss_example( optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss) - iterator = distribution.distribute_dataset(dataset) + iterator = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() def run_step(): return control_flow_ops.group(distribution.unwrap( diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py index dfcbb8568f92ebabbeeedb45ee677e4ee23d77dc..7b3670b45aba801cf8c18e04bfea03e23eb67184 100644 --- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py +++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py @@ -26,6 +26,7 @@ from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest as data_nest from tensorflow.python.data.util import sparse +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops @@ -34,26 +35,55 @@ from tensorflow.python.util import nest # pylint: disable=protected-access class _PrefetchToDeviceIterator(object): - """A replacement for @{tf.data.Iterator} that prefetches to another device.""" + """A replacement for @{tf.data.Iterator} that prefetches to another device. - def __init__(self, input_dataset, devices, buffer_size): + Args: + input_dataset: The input dataset. + one_shot: If true, we make a one shot iterator that's already initialized. + devices: Devices on which to prefetch. + buffer_size: Size of the prefetching buffer. + shared_name: (Optional.) If non-empty, the returned iterator will be + shared under the given name across multiple sessions that share the + same devices (e.g. when using a remote server). Only used if one_shot + is False. + + Returns: + An Iterator type object. + """ + + def __init__(self, + input_dataset, + one_shot, + devices, + buffer_size, + shared_name=None): self._input_dataset = input_dataset self._get_next_call_count = 0 + self._one_shot = one_shot + if shared_name is None: + shared_name = "" self._devices = devices - input_iterator = input_dataset.make_one_shot_iterator() - input_iterator_handle = input_iterator.string_handle() + + if self._one_shot: + self._input_iterator = input_dataset.make_one_shot_iterator() + else: + self._input_iterator = iterator_ops.Iterator.from_structure( + self._input_dataset.output_types, self._input_dataset.output_shapes, + shared_name, self._input_dataset.output_classes) + input_iterator_handle = self._input_iterator.string_handle() @function.Defun(dtypes.string) def _prefetch_fn(handle): """Prefetches one element from `input_iterator`.""" remote_iterator = iterator_ops.Iterator.from_string_handle( - handle, input_iterator.output_types, input_iterator.output_shapes, - input_iterator.output_classes) + handle, self._input_iterator.output_types, + self._input_iterator.output_shapes, + self._input_iterator.output_classes) ret = remote_iterator.get_next() return nest.flatten(sparse.serialize_sparse_tensors(ret)) target_device = gen_dataset_ops.iterator_get_device( - input_iterator._iterator_resource) + self._input_iterator._iterator_resource) self._buffering_resources = [] for device in nest.flatten(self._devices): with ops.device(device): @@ -61,9 +91,19 @@ class _PrefetchToDeviceIterator(object): f=_prefetch_fn, target_device=target_device, string_arg=input_iterator_handle, - buffer_size=buffer_size) + buffer_size=buffer_size, + shared_name=shared_name) self._buffering_resources.append(buffer_resource_handle) + if not self._one_shot: + reset_ops = [] + for buffer_resource in self._buffering_resources: + reset_ops.append( + prefetching_ops.function_buffering_resource_reset(buffer_resource)) + with ops.control_dependencies(reset_ops): + self._initializer = self._input_iterator.make_initializer( + self._input_dataset) + def get_next(self, name=None): """See @{tf.data.Iterator.get_next}.""" self._get_next_call_count += 1 @@ -92,6 +132,12 @@ class _PrefetchToDeviceIterator(object): return nest.pack_sequence_as(self._devices, flat_result) + @property + def initializer(self): + if self._one_shot: + raise NotImplementedError("Can't initialize a one_shot_iterator") + return self._initializer + @property def output_classes(self): return self._input_dataset.output_classes @@ -115,13 +161,24 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset): self._buffer_size = buffer_size if buffer_size is not None else 1 def make_one_shot_iterator(self): - return _PrefetchToDeviceIterator(self._input_dataset, self._devices, - self._buffer_size) + return _PrefetchToDeviceIterator( + self._input_dataset, + one_shot=True, + devices=self._devices, + buffer_size=self._buffer_size) def make_initializable_iterator(self, shared_name=None): - raise NotImplementedError("`prefetch_to_devices()` is not currently " - "compatible with initializable iterators. Use " - "`make_one_shot_iterator()` instead.") + if context.executing_eagerly(): + raise RuntimeError( + "make_initializable_iterator is not supported when eager " + "execution is enabled.") + + return _PrefetchToDeviceIterator( + self._input_dataset, + one_shot=False, + devices=self._devices, + buffer_size=self._buffer_size, + shared_name=shared_name) def _as_variant_tensor(self): # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py index 8ed16f4607881f2864479c04b4c25e95d9fa1850..a68dbce6c7d03f6a1695ebfcd00178e21ac1cda0 100644 --- a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py +++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py @@ -64,5 +64,27 @@ class PrefetchingOpsV2Test(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + def testPrefetchToTwoDevicesWithReinit(self): + if not test_util.is_gpu_available(): + self.skipTest("No GPU available") + + host_dataset = dataset_ops.Dataset.range(10) + device_dataset = host_dataset.apply( + prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"])) + + iterator = device_dataset.make_initializable_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + sess.run(iterator.initializer) + for _ in range(5): + sess.run(next_element) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + sess.run(iterator.initializer) + for _ in range(5): + sess.run(next_element) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py index cef5fd2f8943d348a0721cd72032bf6cb2199ad9..0db0b59fcacee2785eb8191bb84ed5216a79b081 100644 --- a/tensorflow/contrib/distribute/python/single_loss_example.py +++ b/tensorflow/contrib/distribute/python/single_loss_example.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.data.python.ops import batching from tensorflow.contrib.distribute.python import step_fn from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op @@ -29,7 +30,10 @@ from tensorflow.python.ops import math_ops def single_loss_example(optimizer_fn, distribution, use_bias=False): """Build a very simple network to use in tests and examples.""" - dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat() + + def dataset_fn(): + return dataset_ops.Dataset.from_tensors([[1.]]).repeat() + optimizer = optimizer_fn() layer = core.Dense(1, use_bias=use_bias) @@ -37,8 +41,8 @@ def single_loss_example(optimizer_fn, distribution, use_bias=False): y = array_ops.reshape(layer(x), []) - constant_op.constant(1.) return y * y - single_loss_step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer, - distribution) + single_loss_step = step_fn.StandardSingleLossStep(dataset_fn, loss_fn, + optimizer, distribution) # Layer is returned for inspecting the kernels in tests. return single_loss_step, layer @@ -49,7 +53,14 @@ def minimize_loss_example(optimizer_fn, use_callable_loss=True, create_optimizer_inside_model_fn=False): """Example of non-distribution-aware legacy code.""" - dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat() + + def dataset_fn(): + dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat() + # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be + # fully defined for TPU. Remove this when XLA supports dynamic shapes. + return dataset.apply( + batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True)) + # An Optimizer instance is created either outside or inside model_fn. outer_optimizer = None if not create_optimizer_inside_model_fn: @@ -57,10 +68,11 @@ def minimize_loss_example(optimizer_fn, layer = core.Dense(1, use_bias=use_bias) - def model_fn(x): + def model_fn(xs): """A very simple model written by the user.""" def loss_fn(): + x = math_ops.reduce_mean(xs, keepdims=True) y = array_ops.reshape(layer(x), []) - constant_op.constant(1.) return y * y @@ -71,7 +83,7 @@ def minimize_loss_example(optimizer_fn, else: return optimizer.minimize(loss_fn()) - return model_fn, dataset, layer + return model_fn, dataset_fn, layer def batchnorm_example(optimizer_fn, @@ -79,12 +91,15 @@ def batchnorm_example(optimizer_fn, momentum=0.9, renorm=False): """Example of non-distribution-aware legacy code with batch normalization.""" - # input shape is [16, 8], input values are increasing in both dimensions. - dataset = dataset_ops.Dataset.from_tensor_slices( - [[[float(x * 8 + y + z * 100) - for y in range(8)] - for x in range(16)] - for z in range(batch_per_epoch)]).repeat() + + def dataset_fn(): + # input shape is [16, 8], input values are increasing in both dimensions. + return dataset_ops.Dataset.from_tensor_slices( + [[[float(x * 8 + y + z * 100) + for y in range(8)] + for x in range(16)] + for z in range(batch_per_epoch)]).repeat() + optimizer = optimizer_fn() batchnorm = normalization.BatchNormalization( renorm=renorm, momentum=momentum, fused=False) @@ -99,4 +114,4 @@ def batchnorm_example(optimizer_fn, # Callable loss. return optimizer.minimize(loss_fn) - return model_fn, dataset, batchnorm + return model_fn, dataset_fn, batchnorm diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py index 82514c64be40b421c4a9887932f2cfb8e1ac4be0..d1910622b38c748fc5a814f9e83c2294850d5d12 100644 --- a/tensorflow/contrib/distribute/python/step_fn.py +++ b/tensorflow/contrib/distribute/python/step_fn.py @@ -49,12 +49,14 @@ class StandardInputStep(Step): """Step with a standard implementation of input handling. Args: - input_dataset: a tf.data Dataset that provides input. + dataset_fn: a function that returns a tf.data Dataset that produces the + input for the model. """ - def __init__(self, input_dataset, distribution): + def __init__(self, dataset_fn, distribution): Step.__init__(self, distribution) - self._distributed_input = distribution.distribute_dataset(input_dataset) + self._distributed_input = distribution.distribute_dataset( + dataset_fn).make_one_shot_iterator() def inputs(self): return self._distributed_input.get_next() @@ -76,14 +78,15 @@ class StandardSingleLossStep(StandardInputStep): ``` Args: - input_dataset: a tf.data Dataset that provides input. + dataset_fn: a function that returns a tf.data Dataset that produces the + input for the model. loss_fn: a function that returns loss. optimizer: an optimizer that implements an update rule. distribution: a `DistributionStrategy` object. """ - def __init__(self, input_dataset, loss_fn, optimizer, distribution): - StandardInputStep.__init__(self, input_dataset, distribution) + def __init__(self, dataset_fn, loss_fn, optimizer, distribution): + StandardInputStep.__init__(self, dataset_fn, distribution) self._loss_fn = loss_fn self._optimizer = optimizer self._is_run_concurrently = False diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index 0ac307dd6a919179fda3af2448b39eaa59b811c6..a7e4fe80f3e65907fa4b48c5fe0fcfd422ba033f 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -21,62 +21,99 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import itertools + from tensorflow.contrib import tpu from tensorflow.contrib.distribute.python import one_device_strategy +from tensorflow.contrib.distribute.python import values from tensorflow.contrib.tpu.python.ops import tpu_ops from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.util import nest # TODO(isaprykin): Consider whether inheriting is really appropriate. -class TpuStrategy(one_device_strategy.OneDeviceStrategy): - - def __init__(self, master=None, iterations=None, model_dir=None): - super(TpuStrategy, self).__init__('/cpu:0') +class TPUStrategy(one_device_strategy.OneDeviceStrategy): + """Experimental TPU distribution strategy implementation.""" + + def __init__(self, + num_cores_per_host=2, + iterations_per_step=2): + # TODO(isaprykin): Generalize the defaults. They are currently tailored for + # the unit test. + super(TPUStrategy, self).__init__('/cpu:0') + # TODO(isaprykin): Auto-detect number of cores and hosts. + self._num_cores_per_host = num_cores_per_host + # TODO(isaprykin): This might have to be per-call. + self._iterations_per_step = iterations_per_step + + def distribute_dataset(self, dataset_fn): + return values.PerIterationDataset( + self._call_dataset_fn(dataset_fn), self._iterations_per_step, + self._num_cores_per_host) def _call_for_each_tower(self, fn, *args, **kwargs): kwargs.pop('run_concurrently', None) - # TODO(isaprykin): Give an API for many iterations per step. - iterations = 1 - - # TODO(isaprykin): Do not hard code shapes and input format :) - # TODO(isaprykin): Detect the number of TPU cores automatically. + inputs = {'args': args, 'kwargs': kwargs} + flat_inputs = nest.flatten(inputs) - def dequeueing_fn(*args, **kwargs): - del args, kwargs - x, = tpu.infeed_dequeue_tuple(dtypes=[dtypes.float32], shapes=[[1, 1, 1]]) - return fn(x) + feed_mask = [isinstance(f, values.PerIteration) for f in flat_inputs] - iterator = args[0] + feeds = lambda: itertools.compress(flat_inputs, feed_mask) + shapes = [f.get_shape() for f in feeds()] + if any([not s.is_fully_defined() for s in shapes]): + raise ValueError( + 'TPU currently requires fully defined shapes. Either use ' + 'set_shape() on the input tensors or use ' + 'dataset.apply(map_and_batch(..., drop_remainder=True)).') + types = [f.get_dtype() for f in feeds()] def infeed_input(i): """Get input, split it and then enqueue.""" - batches = iterator.get_next() - batches = array_ops.split(batches, 2) + iteration_inputs = [f.get(i) for f in feeds()] + + infeed_inputs = [[inputs_per_core[core_id] + for inputs_per_core in iteration_inputs] + for core_id in range(self._num_cores_per_host)] - infeeds = [ - tpu_ops.infeed_enqueue_tuple( - inputs=[batches[j]], shapes=[[1, 1, 1]], device_ordinal=j) - for j in range(2) - ] + infeed_ops = [] + for core_id, infeed_input in enumerate(infeed_inputs): + infeed_ops.append( + tpu_ops.infeed_enqueue_tuple( + inputs=infeed_input, shapes=shapes, device_ordinal=core_id)) - with ops.control_dependencies(infeeds): + with ops.control_dependencies(infeed_ops): return i + 1 with ops.device('/task:0/device:CPU:0'): enqueue_ops = control_flow_ops.while_loop( - lambda i: i < iterations, + lambda i: i < self._iterations_per_step, infeed_input, [constant_op.constant(0)], parallel_iterations=1) + def dequeueing_fn(*args, **kwargs): + """Dequeue input arguments and supply them to `fn`.""" + del args, kwargs + dequeued = tpu.infeed_dequeue_tuple(dtypes=types, shapes=shapes) + dequeued = iter(dequeued) + + fn_inputs = [] + for inp, is_feed in zip(flat_inputs, feed_mask): + if is_feed: + fn_inputs.append(next(dequeued)) + else: + fn_inputs.append(inp) + + fn_inputs = nest.pack_sequence_as(inputs, fn_inputs) + return fn(*fn_inputs['args'], **fn_inputs['kwargs']) + def iterate_on_tpu(): - return tpu.repeat(iterations, dequeueing_fn, []) + return tpu.repeat(self._iterations_per_step, dequeueing_fn, []) with one_device_strategy._OneDeviceTowerContext(self): # pylint: disable=protected-access - tpu_result = tpu.batch_parallel(iterate_on_tpu, [], num_shards=2) + tpu_result = tpu.batch_parallel( + iterate_on_tpu, [], num_shards=self._num_cores_per_host) return control_flow_ops.group(tpu_result, enqueue_ops) diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 87bf0590384cc74ca0f0575bcef4e84599a8b666..8cb5276579f48f9ea5781c5351cbf9bf3db16e6c 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -28,7 +28,6 @@ import six from tensorflow.contrib.data.python.ops import batching from tensorflow.contrib.distribute.python import prefetching_ops_v2 -from tensorflow.contrib.eager.python import datasets from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -510,6 +509,10 @@ class PerDeviceDataIterator(object): self._devices = devices self._prefetch_on_device = prefetch_on_device + @property + def initializer(self): + return self._iterator.initializer + def get_next(self, name=None): """Scatter the input across devices.""" if self._prefetch_on_device: @@ -545,7 +548,8 @@ class PerDeviceDataset(object): "Prefetching is only supported in graph mode currently") if self._prefetch_on_device: - self._dataset = dataset + self._dataset = dataset.apply( + prefetching_ops_v2.prefetch_to_devices(self._devices)) else: # TODO(priyag): If dropping remainder is not appropriate, find another # approach to distributing the dataset when not possible to divide evenly. @@ -555,19 +559,72 @@ class PerDeviceDataset(object): def make_one_shot_iterator(self): """Get a one time use iterator for the distributed PerDeviceDataset.""" - if self._prefetch_on_device: - on_device_dataset = self._dataset.apply( - prefetching_ops_v2.prefetch_to_devices(self._devices)) - dataset_iterator = on_device_dataset.make_one_shot_iterator() - elif context.executing_eagerly(): - dataset_iterator = datasets.Iterator(self._dataset) - else: - dataset_iterator = self._dataset.make_one_shot_iterator() + dataset_iterator = self._dataset.make_one_shot_iterator() + return PerDeviceDataIterator( + dataset_iterator, self._devices, self._prefetch_on_device) + def make_initializable_iterator(self): + """Get an initializable iterator for the distributed PerDeviceDataset.""" + dataset_iterator = self._dataset.make_initializable_iterator() return PerDeviceDataIterator( dataset_iterator, self._devices, self._prefetch_on_device) +class PerIteration(object): + """Holds input for multiple iterations at once.""" + + def __init__(self, index): + self._index = index + + def get(self, iteration): + return array_ops.gather(self._index, iteration) + + def get_shape(self): + return self._index[-1][-1].get_shape() + + def get_dtype(self): + return self._index[-1][-1].dtype + + +class MultiIterator(object): + """Iterator that returns results of multiple get_next()s.""" + + def __init__(self, dataset_iterator, iterations, batches_per_iteration): + self._dataset_iterator = dataset_iterator + self._iterations = iterations + self._batches_per_iteration = batches_per_iteration + + def get_next(self, name=None): + return PerIteration([[ + self._dataset_iterator.get_next(name=name) + for _ in range(self._batches_per_iteration) + ] + for _ in range(self._iterations)]) + + @property + def initializer(self): + return self._dataset_iterator.initializer + + +class PerIterationDataset(object): + """A dataset that returns MultiIterators.""" + + def __init__(self, dataset, iterations, batches_per_iteration): + self._dataset = dataset + self._iterations = iterations + self._batches_per_iteration = batches_per_iteration + + def make_one_shot_iterator(self): + iterator = self._dataset.make_one_shot_iterator() + return MultiIterator(iterator, self._iterations, + self._batches_per_iteration) + + def make_initializable_iterator(self): + iterator = self._dataset.make_initializable_iterator() + return MultiIterator(iterator, self._iterations, + self._batches_per_iteration) + + class MapOutput(object): """Map can result in multiple outputs per device.""" diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py index 5c0d4b7d6c78b7cf63c613201d83d4793ecfe76b..e96ce547415fcb2bf3da8b6085ee11f51717db8d 100644 --- a/tensorflow/contrib/distribute/python/values_test.py +++ b/tensorflow/contrib/distribute/python/values_test.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope from tensorflow.python.training import device_util from tensorflow.python.training import saver as saver_lib @@ -408,6 +409,32 @@ class PerDeviceDatasetTest(test.TestCase): expected_values = [[i, i+1] for i in range(0, 10, 2)] self._test_iterator(devices, dataset, expected_values) + def testInitializableIterator(self): + with context.graph_mode(): + devices = ["/device:CPU:0"] + # Using random input since that is only allowed with initializable + # iterator. + dataset = dataset_ops.Dataset.from_tensor_slices( + random_ops.random_uniform((10,))) + + per_device_dataset = values.PerDeviceDataset( + dataset, devices, prefetch_on_device=False) + iterator = per_device_dataset.make_initializable_iterator() + + self.evaluate(iterator.initializer) + next_element = iterator.get_next() + for _ in range(10): + self.evaluate(next_element) + + # Should fail after the input is finished. + with self.assertRaises(errors.OutOfRangeError): + self.evaluate(next_element) + + # After re-initializing the iterator, should be able to iterate again. + self.evaluate(iterator.initializer) + for _ in range(10): + self.evaluate(next_element) + @test_util.with_c_api class MirroredVariableTest(test.TestCase): diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 20e432b88dc60d45fd32710574ed6e57d0f8a792..2d99e8172d220a697c3efaf07c6606874bd7dd01 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -877,6 +877,7 @@ cuda_py_test( "//tensorflow/python:framework_test_lib", "//tensorflow/python:platform_test", ], + tags = ["optonly"], ) cuda_py_test( diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a5f5219588fb3be67beb797ba68ed8148e9e9fd2 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py @@ -0,0 +1,109 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite +from tensorflow.python.platform import test + + + +class OrderedBijectorTest(test.TestCase): + """Tests correctness of the ordered transformation.""" + + def setUp(self): + self._rng = np.random.RandomState(42) + + @test_util.run_in_graph_and_eager_modes() + def testBijectorVector(self): + with self.test_session(): + ordered = Ordered() + self.assertEqual("ordered", ordered.name) + x = np.asarray([[2., 3, 4], [4., 8, 13]]) + y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]] + self.assertAllClose(y, self.evaluate(ordered.forward(x))) + self.assertAllClose(x, self.evaluate(ordered.inverse(y))) + self.assertAllClose( + np.sum(np.asarray(y)[..., 1:], axis=-1), + self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)), + atol=0., + rtol=1e-7) + self.assertAllClose( + self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)), + self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)), + atol=0., + rtol=1e-7) + + def testBijectorUnknownShape(self): + with self.test_session(): + ordered = Ordered() + self.assertEqual("ordered", ordered.name) + x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32) + real_x = np.asarray([[2., 3, 4], [4., 8, 13]]) + y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32) + real_y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]] + self.assertAllClose(real_y, ordered.forward(x).eval( + feed_dict={x: real_x})) + self.assertAllClose(real_x, ordered.inverse(y).eval( + feed_dict={y: real_y})) + self.assertAllClose( + np.sum(np.asarray(real_y)[..., 1:], axis=-1), + ordered.inverse_log_det_jacobian(y, event_ndims=1).eval( + feed_dict={y: real_y}), + atol=0., + rtol=1e-7) + self.assertAllClose( + -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval( + feed_dict={y: real_y}), + ordered.forward_log_det_jacobian(x, event_ndims=1).eval( + feed_dict={x: real_x}), + atol=0., + rtol=1e-7) + + @test_util.run_in_graph_and_eager_modes() + def testShapeGetters(self): + with self.test_session(): + x = tensor_shape.TensorShape([4]) + y = tensor_shape.TensorShape([4]) + bijector = Ordered(validate_args=True) + self.assertAllEqual(y, bijector.forward_event_shape(x)) + self.assertAllEqual(y.as_list(), + self.evaluate(bijector.forward_event_shape_tensor( + x.as_list()))) + self.assertAllEqual(x, bijector.inverse_event_shape(y)) + self.assertAllEqual(x.as_list(), + self.evaluate(bijector.inverse_event_shape_tensor( + y.as_list()))) + + def testBijectiveAndFinite(self): + with self.test_session(): + ordered = Ordered() + x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32) + y = (self._rng.randn(3, 10)).astype(np.float32) + assert_bijective_and_finite(ordered, x, y, event_ndims=1) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py index 68e0d9cb8277f3953039963fec0da499db7a16d1..f42feae25d851eb9ae0bf48649fc3bbe2a221be0 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py @@ -190,11 +190,30 @@ class DistributionTest(test.TestCase): y = dist._set_sample_static_shape(x, sample_shape) self.assertTrue(y.get_shape().ndims is None) + def testNameScopeWorksCorrectly(self): + x = tfd.Normal(loc=0., scale=1., name="x") + x_duplicate = tfd.Normal(loc=0., scale=1., name="x") + with ops.name_scope("y") as name: + y = tfd.Bernoulli(logits=0., name=name) + x_sample = x.sample(name="custom_sample") + x_sample_duplicate = x.sample(name="custom_sample") + x_log_prob = x.log_prob(0., name="custom_log_prob") + x_duplicate_sample = x_duplicate.sample(name="custom_sample") + + self.assertEqual(x.name, "x/") + self.assertEqual(x_duplicate.name, "x_1/") + self.assertEqual(y.name, "y/") + self.assertTrue(x_sample.name.startswith("x/custom_sample")) + self.assertTrue(x_sample_duplicate.name.startswith("x/custom_sample_1")) + self.assertTrue(x_log_prob.name.startswith("x/custom_log_prob")) + self.assertTrue(x_duplicate_sample.name.startswith( + "x_1/custom_sample")) + def testStrWorksCorrectlyScalar(self): normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1)) self.assertEqual( ("tf.distributions.Normal(" - "\"Normal\", " + "\"Normal/\", " "batch_shape=(), " "event_shape=(), " "dtype=float16)"), # Got the dtype right. @@ -203,7 +222,7 @@ class DistributionTest(test.TestCase): chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly") self.assertEqual( ("tf.distributions.Chi2(" - "\"silly\", " # What a silly name that is! + "\"silly/\", " # What a silly name that is! "batch_shape=(2,), " "event_shape=(), " "dtype=float32)"), @@ -211,7 +230,7 @@ class DistributionTest(test.TestCase): exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32)) self.assertEqual( - ("tf.distributions.Exponential(\"Exponential\", " + ("tf.distributions.Exponential(\"Exponential/\", " # No batch shape. "event_shape=(), " "dtype=float32)"), @@ -222,7 +241,7 @@ class DistributionTest(test.TestCase): loc=np.zeros([2, 2]), name="MVN") self.assertEqual( ("tf.distributions.MultivariateNormalDiag(" - "\"MVN\", " + "\"MVN/\", " "batch_shape=(2,), " "event_shape=(2,), " "dtype=float64)"), @@ -233,7 +252,7 @@ class DistributionTest(test.TestCase): name="MVN2") self.assertEqual( ("tf.distributions.MultivariateNormalDiag(" - "\"MVN2\", " + "\"MVN2/\", " "batch_shape=(?,), " # Partially known. "event_shape=(3,), " "dtype=float32)"), @@ -243,7 +262,7 @@ class DistributionTest(test.TestCase): normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1)) self.assertEqual( (""), # Got the dtype right. @@ -252,7 +271,7 @@ class DistributionTest(test.TestCase): chi2 = tfd.Chi2(df=np.float32([1., 2.]), name="silly") self.assertEqual( (""), @@ -261,7 +280,7 @@ class DistributionTest(test.TestCase): exp = tfd.Exponential(rate=array_ops.placeholder(dtype=dtypes.float32)) self.assertEqual( ("" " event_shape=()" " dtype=float32>"), @@ -272,7 +291,7 @@ class DistributionTest(test.TestCase): loc=np.zeros([2, 2]), name="MVN") self.assertEqual( (""), @@ -283,7 +302,7 @@ class DistributionTest(test.TestCase): name="MVN2") self.assertEqual( (""), diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py index 1a02fbefb8e88599f5fedeb38fb06f5a09036439..7435bcbc684c1660a648cef4ab30c888723853f8 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py @@ -52,7 +52,7 @@ class MultivariateNormalFullCovarianceTest(test.TestCase): mu = [1., 2.] sigma = [[1., 0.], [0., 1.]] mvn = ds.MultivariateNormalFullCovariance(mu, sigma, name="Billy") - self.assertEqual(mvn.name, "Billy") + self.assertEqual(mvn.name, "Billy/") def testDoesNotRaiseIfInitializedWithSymmetricMatrix(self): with self.test_session(): diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py index 69f3d57ff000d6c9acc8aa9e3d0ad8d9cbb6bb3c..88ed0127841093cc1a1168d988f14e7bb0277b12 100644 --- a/tensorflow/contrib/distributions/python/ops/autoregressive.py +++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py @@ -145,7 +145,7 @@ class Autoregressive(distribution_lib.Distribution): ValueError: if `num_steps < 1`. """ parameters = locals() - with ops.name_scope(name): + with ops.name_scope(name) as name: self._distribution_fn = distribution_fn self._sample0 = sample0 self._distribution0 = (distribution_fn() if sample0 is None diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py index babce80396cfc41b53e99f91038d4f077c7efe82..51478dbeffaabc58ce3662f25f06bc579e8a407e 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py @@ -30,6 +30,7 @@ @@Invert @@Kumaraswamy @@MaskedAutoregressiveFlow +@@Ordered @@Permute @@PowerTransform @@RealNVP @@ -67,6 +68,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.inline import * from tensorflow.contrib.distributions.python.ops.bijectors.invert import * from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import * from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import * +from tensorflow.contrib.distributions.python.ops.bijectors.ordered import * from tensorflow.contrib.distributions.python.ops.bijectors.permute import * from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import * from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import * diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py index caae2adcfac7643cdc8f76dd1cccddd516105410..ecdb8967f43e5960b2285de05125d0c3dbafe63c 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py @@ -170,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector): sum_weighted_log_diag = array_ops.squeeze( math_ops.matmul(math_ops.log(diag), exponents[..., array_ops.newaxis]), - squeeze_dims=-1) + axis=-1) fldj = p_float * np.log(2.) + sum_weighted_log_diag return fldj diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py index 1904239a0e7009c35cc4f3c8876fd749463a2b83..84a3289ba2160ed22a2bc7030dd612ba9ca6f6df 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py @@ -18,14 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.ops.distributions import bijector as bijector_lib +from tensorflow.python.ops.distributions import bijector __all__ = [ "Invert", ] -class Invert(bijector_lib.Bijector): +class Invert(bijector.Bijector): """Bijector which inverts another Bijector. Example Use: [ExpGammaDistribution (see Background & Context)]( diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py index ef56cf6ddda4dca2b1575e844b2584689e531b81..83667b0e80cfcc1c4f0617cdc739221f24439665 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import template as template_ops from tensorflow.python.ops import variable_scope as variable_scope_lib -from tensorflow.python.ops.distributions import bijector as bijector_lib +from tensorflow.python.ops.distributions import bijector __all__ = [ @@ -42,7 +42,7 @@ __all__ = [ ] -class MaskedAutoregressiveFlow(bijector_lib.Bijector): +class MaskedAutoregressiveFlow(bijector.Bijector): """Affine MaskedAutoregressiveFlow bijector for vector-valued events. The affine autoregressive flow [(Papamakarios et al., 2016)][3] provides a diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py new file mode 100644 index 0000000000000000000000000000000000000000..3f03592f314cc13e8a9ea7e2ae18c5bb1f14e74f --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py @@ -0,0 +1,125 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Ordered bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import bijector + + +__all__ = [ + "Ordered", +] + + +class Ordered(bijector.Bijector): + """Bijector which maps a tensor x_k that has increasing elements in the last + dimension to an unconstrained tensor y_k. + + Both the domain and the codomain of the mapping is [-inf, inf], however, + the input of the forward mapping must be strictly increasing. + The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)` + gives back a sorted random vector with the same distribution `x ~ N(0, 1)` + where `x = sort(y)` + + On the last dimension of the tensor, Ordered bijector performs: + `y[0] = x[0]` + `y[1:] = math_ops.log(x[1:] - x[:-1])` + + #### Example Use: + + ```python + bijector.Ordered().forward([2, 3, 4]) + # Result: [2., 0., 0.] + + bijector.Ordered().inverse([0.06428002, -1.07774478, -0.71530371]) + # Result: [0.06428002, 0.40464228, 0.8936858] + ``` + """ + + def __init__(self, validate_args=False, name="ordered"): + super(Ordered, self).__init__( + forward_min_event_ndims=1, + validate_args=validate_args, + name=name) + + def _forward_event_shape(self, input_shape): + if input_shape.ndims is None or input_shape[-1] is None: + return input_shape + return tensor_shape.TensorShape([input_shape[-1]]) + + def _forward_event_shape_tensor(self, input_shape): + return (input_shape[-1])[..., array_ops.newaxis] + + def _inverse_event_shape(self, output_shape): + if output_shape.ndims is None or output_shape[-1] is None: + return output_shape + if output_shape[-1] <= 1: + raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1]) + return tensor_shape.TensorShape([output_shape[-1]]) + + def _inverse_event_shape_tensor(self, output_shape): + if self.validate_args: + is_greater_one = check_ops.assert_greater( + output_shape[-1], 1, message="Need last dimension greater than 1.") + output_shape = control_flow_ops.with_dependencies( + [is_greater_one], output_shape) + return (output_shape[-1])[..., array_ops.newaxis] + + def _forward(self, x): + x = self._maybe_assert_valid_x(x) + y0 = x[..., 0, array_ops.newaxis] + yk = math_ops.log(x[..., 1:] - x[..., :-1]) + y = array_ops.concat([y0, yk], axis=-1) + return y + + def _inverse(self, y): + x0 = y[..., 0, array_ops.newaxis] + xk = math_ops.exp(y[..., 1:]) + x = array_ops.concat([x0, xk], axis=-1) + return math_ops.cumsum(x, axis=-1) + + def _inverse_log_det_jacobian(self, y): + # The Jacobian of the inverse mapping is lower + # triangular, with the diagonal elements being: + # J[i,i] = 1 if i=1, and + # exp(y_i) if 1GPU->CPU, which forces - # a sync. This is a roundabout way, yes. + # then this will force a copy from CPU->NON_CPU_DEVICE->CPU, + # which forces a sync. This is a roundabout way, yes. tf.constant(1.).cpu() - def _benchmark_eager_apply(self, label, defun=False, execution_mode=None): + def _benchmark_eager_apply(self, label, defun=False, execution_mode=None, + device_and_format=None): with tfe.execution_mode(execution_mode): - device, data_format = device_and_data_format() + device, data_format = device_and_format or device_and_data_format() model = resnet50.ResNet50(data_format) if defun: model.call = tfe.defun(model.call) @@ -207,7 +213,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): num_burn = 5 num_iters = 30 with tf.device(device): - images, _ = random_batch(batch_size) + images, _ = random_batch(batch_size, device_and_format) for _ in xrange(num_burn): model(images, training=False).cpu() if execution_mode: @@ -220,7 +226,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): tfe.async_wait() self._report(label, start, num_iters, device, batch_size, data_format) - def benchmark_eager_apply(self): + def benchmark_eager_apply_sync(self): self._benchmark_eager_apply('eager_apply', defun=False) def benchmark_eager_apply_async(self): @@ -234,11 +240,12 @@ class ResNet50Benchmarks(tf.test.Benchmark): label, make_iterator, defun=False, - execution_mode=None): + execution_mode=None, + device_and_format=None): with tfe.execution_mode(execution_mode): - device, data_format = device_and_data_format() + device, data_format = device_and_format or device_and_data_format() for batch_size in self._train_batch_sizes(): - (images, labels) = random_batch(batch_size) + (images, labels) = random_batch(batch_size, device_and_format) num_burn = 3 num_iters = 10 model = resnet50.ResNet50(data_format) @@ -253,7 +260,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): train_one_step(model, images, labels, optimizer) if execution_mode: tfe.async_wait() - self._force_gpu_sync() + self._force_device_sync() gc.collect() start = time.time() @@ -262,10 +269,10 @@ class ResNet50Benchmarks(tf.test.Benchmark): train_one_step(model, images, labels, optimizer) if execution_mode: tfe.async_wait() - self._force_gpu_sync() + self._force_device_sync() self._report(label, start, num_iters, device, batch_size, data_format) - def benchmark_eager_train(self): + def benchmark_eager_train_sync(self): self._benchmark_eager_train('eager_train', MockIterator, defun=False) def benchmark_eager_train_async(self): diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index 9f4cd44afbede286966ba0e7357c5dac92a2b729..b473de86ee8be92e8111ee5098b2536d4b957a8c 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -210,7 +210,7 @@ py_library( py_test( name = "head_test", - size = "small", + size = "medium", srcs = ["python/estimator/head_test.py"], srcs_version = "PY2AND3", deps = [ @@ -250,7 +250,7 @@ py_library( py_test( name = "linear_test", - size = "small", + size = "medium", srcs = ["python/estimator/linear_test.py"], srcs_version = "PY2AND3", tags = [ @@ -367,6 +367,7 @@ py_library( "//tensorflow/python:sparse_tensor", "//tensorflow/python:state_ops", "//tensorflow/python:training", + "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python/estimator:export_output", "//tensorflow/python/estimator:model_fn", @@ -447,6 +448,7 @@ py_test( srcs_version = "PY2AND3", tags = [ "no_pip", + "noasan", # times out "notsan", ], deps = [ diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py index a8774d6dab9205439e6e312827f9cd1306e3f1ea..f8564446e5da3e785b85010998d18dca0424d16b 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py @@ -47,8 +47,12 @@ from tensorflow.python.ops.losses import losses from tensorflow.python.platform import tf_logging from tensorflow.python.training import device_setter as device_setter_lib from tensorflow.python.training import optimizer as optimizer_lib +from tensorflow.python.util import deprecation +@deprecation.deprecated( + '2018-05-31', + 'Please use `tf.contrib.distribute.MirroredStrategy` instead.') def replicate_model_fn(model_fn, loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, devices=None): @@ -255,6 +259,9 @@ class TowerOptimizer(optimizer_lib.Optimizer): COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states' + @deprecation.deprecated( + '2018-05-31', + 'Please use `tf.contrib.distribute.MirroredStrategy` instead.') def __init__(self, optimizer_or_optimizer_fn): """Wrap an existing optimizer for gathering gradients across towers. diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py index 144b45982c8aec2e2b115c812b24e8843d60ce1e..dd8a3a95f1b83bfd29e8a38ec1512f90e22968d9 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py @@ -540,59 +540,6 @@ class ReplicateAcrossASingleDeviceWithoutTowerOptimizer( self.assertEqual(7.0, session.run(c)) -class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase): - - def model_fn(self, mode, features, labels, params): - c = variable_scope.get_variable( - 'c', - initializer=constant_op.constant(10, dtype=dtypes.float64), - dtype=dtypes.float64) - - features = features['features'] - predictions = math_ops.multiply(features, c) - - loss = losses.absolute_difference( - labels=labels, predictions=predictions, reduction=losses.Reduction.SUM) - loss = math_ops.reduce_sum(loss) - - metrics = { - 'accuracy': metrics_lib.accuracy(labels, predictions), - 'auc': metrics_lib.auc(labels, predictions) - } - - optimizer = replicate_model_fn.TowerOptimizer( - gradient_descent.GradientDescentOptimizer(params['learning_rate'])) - - return model_fn_lib.EstimatorSpec( - mode=mode, - loss=loss, - eval_metric_ops=metrics, - predictions={'probabilities': predictions}, - train_op=optimizer.minimize(loss)) - - @property - def params(self): - params = {} - params['learning_rate'] = 1.0 - return params - - def test_train_single_tower(self): - features = np.array([[1.0], [2.0]]) - labels = np.array([[1.0], [2.0]]) - - train_input_fn = numpy_io.numpy_input_fn( - x={'features': features}, y=labels, batch_size=2, shuffle=False) - - with self.test_session(): - estimator = estimator_lib.Estimator( - model_fn=self.model_fn, - model_dir=tempfile.mkdtemp(), - params=self.params) - estimator.train(train_input_fn, steps=1) - - self.assertEqual(7.0, estimator.get_variable_value('c')) - - class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase): def model_fn(self, mode, features, labels, params): diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index 0a648d5d40e431bedb42017b15cabe078ac22fa7..f28d95401c3a5dd89ace9ff2014dd2207a3ea743 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -215,6 +215,7 @@ tf_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:sparse_tensor", ], + tags = ["noasan"], # times out b/78588193 ) # Estimators tests diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc index 2a6c97e8b9526894eba057505a2bf823ad778f56..025534d540bb82cdb87bb2977d08dfa4f02f1bc8 100644 --- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc +++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc @@ -32,6 +32,7 @@ #include "tensorflow/core/lib/gtl/top_n.h" #include "tensorflow/core/lib/random/philox_random.h" #include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py index ccdd679d6ae197ab21d5fed5e89daa26bc15a809..e076631bc16fd379a2ad31af9055a7388d98c7ca 100644 --- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py +++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py @@ -397,7 +397,7 @@ class GmmAlgorithm(object): # Compute the effective number of data points assigned to component k. with ops.control_dependencies(self._w): points_in_k = array_ops.squeeze( - math_ops.add_n(self._points_in_k), squeeze_dims=[0]) + math_ops.add_n(self._points_in_k), axis=[0]) # Update alpha. if 'w' in self._params: final_points_in_k = points_in_k / num_batches diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index 35341406a08dc681c861aea30fcff784e3b963ef..cca1a054193815793846a8753678f75bdfd72a6c 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -28,7 +28,7 @@ #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" using tensorflow::strings::StrCat; diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD index b1c8ad49eaf8d2400e431fcf4820fca6e0314557..249debbdf6dff412a5be6cb1032fc4a3567c7d0b 100644 --- a/tensorflow/contrib/framework/BUILD +++ b/tensorflow/contrib/framework/BUILD @@ -93,6 +93,7 @@ tf_kernel_library( ], deps = [ "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//third_party/eigen3", ], alwayslink = 1, @@ -177,6 +178,8 @@ cuda_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:tensor_array_ops", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/eager:context", ], ) diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 11397e86bd811e376ff4b60f86109d9ee5dd1a62..10d1ecc738de6777784200ba934a521dff592e28 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -108,6 +108,7 @@ from __future__ import print_function # pylint: disable=unused-import,wildcard-import from tensorflow.contrib.framework.python.framework import * +from tensorflow.contrib.framework.python.framework import nest from tensorflow.contrib.framework.python.ops import * # pylint: enable=unused-import,wildcard-import @@ -126,5 +127,20 @@ from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = ['nest', 'broadcast_to'] - +_nest_allowed_symbols = [ + 'assert_same_structure', + 'is_sequence', + 'flatten', + 'flatten_dict_items', + 'pack_sequence_as', + 'map_structure', + 'assert_shallow_structure', + 'flatten_up_to', + 'map_structure_up_to', + 'get_traverse_shallow_structure', + 'yield_flat_paths', + 'flatten_with_joined_string_paths', +] + +remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols) remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc index 5bf6b67529579e71a615c27e035111a58d5c02e0..6ab3f460b36d5dd632daee1af68d62529df9cb09 100644 --- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc +++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_var.h" namespace tensorflow { @@ -85,4 +86,74 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_KERNELS +template +class ZeroVarInitializer : public OpKernel { + public: + explicit ZeroVarInitializer(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_)); + } + + void Compute(OpKernelContext* ctx) override { + Var* variable = nullptr; + OP_REQUIRES_OK(ctx, LookupOrCreateResource( + ctx, HandleFromInput(ctx, 0), &variable, + [this, ctx](Var** var_ptr) { + *var_ptr = new Var(dtype_); + PersistentTensor unused; + Tensor* var_tensor = nullptr; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + dtype_, shape_, &unused, &var_tensor, attr)); + + functor::TensorSetZero()( + ctx->eigen_device(), + var_tensor->flat()); + + *(*var_ptr)->tensor() = *var_tensor; + + return Status::OK(); + })); + + core::ScopedUnref scoped(variable); + mutex_lock ml(*variable->mu()); + + OP_REQUIRES(ctx, !variable->is_initialized, + errors::InvalidArgument("input is already initialized")); + + variable->is_initialized = true; + + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output)); + output->scalar()() = HandleFromInput(ctx, 0); + } + + private: + DataType dtype_; + TensorShape shape_; +}; + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ + ZeroVarInitializer); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("var"), \ + ZeroVarInitializer); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#endif // GOOGLE_CUDA + } // namespace tensorflow diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc index 706134ba9a51de6253ba7463b17ff662ea740ed0..f6ee6cdb5713c113aff2228db58244ac73536d9a 100644 --- a/tensorflow/contrib/framework/ops/variable_ops.cc +++ b/tensorflow/contrib/framework/ops/variable_ops.cc @@ -39,4 +39,33 @@ ref: Should be from a `Variable` node. output_ref:= Same as "ref". )doc"); +REGISTER_OP("ZeroVarInitializer") + .Input("var: resource") + .Output("output_var: resource") + .Attr("dtype: type") + .Attr("shape: shape") + .SetAllowsUninitializedInput() + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Scalar()); + DataType t; + TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t)); + PartialTensorShape p; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &p)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s)); + c->set_output_handle_shapes_and_types( + 0, std::vector{{s, t}}); + + return Status::OK(); + }) + .Doc(R"doc( +Initialize 'var' with all zeros. This op requires that the resource var is not +initialized. The var will first be allocated memory, then be filled with all +zeros. This op is intended to save memory during initialization, +if you use this op, you should not run initializer of the var. + +var: Should be a ResourceVariable. +output_var:= Same as "var". +)doc"); + } // namespace tensorflow diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/contrib/framework/python/ops/critical_section_test.py index ba660295cb3c97d26da7bf892c78bceee53cf2d4..df7d7e9dae80722569efccbc9cc0d1b75e90cf03 100644 --- a/tensorflow/contrib/framework/python/ops/critical_section_test.py +++ b/tensorflow/contrib/framework/python/ops/critical_section_test.py @@ -19,6 +19,8 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.framework.python.ops import critical_section_ops +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -330,6 +332,25 @@ class CriticalSectionTest(test.TestCase): self.evaluate(v.initializer) self.assertEqual(10, self.evaluate(out)) + @test_util.run_in_graph_and_eager_modes() + def testInsideFunction(self): + cs = critical_section_ops.CriticalSection() + v = resource_variable_ops.ResourceVariable(1) + def fn(): + return v.read_value() + + # map() creates a TensorFlow function. + ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn)) + + def get_first(): + if context.executing_eagerly(): + return self.evaluate(ds.make_one_shot_iterator().get_next()) + itr = ds.make_initializable_iterator() + self.evaluate([v.initializer, itr.initializer]) + return self.evaluate(itr.get_next()) + + self.assertEqual(1, get_first()) + # TODO(ebrevdo): Re-enable once CriticalSection is in core. # # def testCriticalSectionAndExecuteOpSaverRoundTrip(self): diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py index 0754c3e0e30a340910a43a3ce86f6ca10afe848e..40ae01bfcce1dde580e6a5f6d9c8ec1aa1abb83f 100644 --- a/tensorflow/contrib/framework/python/ops/variables.py +++ b/tensorflow/contrib/framework/python/ops/variables.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import resource_loader from tensorflow.python.platform import tf_logging as logging @@ -82,7 +83,12 @@ def zero_initializer(ref, use_locking=True, name="zero_initializer"): """ loader.load_op_library( resource_loader.get_path_to_datafile("_variable_ops.so")) - return gen_variable_ops.zero_initializer(ref, name=name) + if resource_variable_ops.is_resource_variable(ref): + return gen_variable_ops.zero_var_initializer( + ref.handle, shape=ref.shape, dtype=ref.dtype, name=name) + else: + return gen_variable_ops.zero_initializer(ref, name=name) + @deprecated(None, "Please switch to tf.train.assert_global_step") def assert_global_step(global_step_tensor): diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py index 2f06df93acb0a4c0b36c68839ff531e3c22c5ee3..37ea6eb12aba7d25656f19cbbc86475c1228d916 100644 --- a/tensorflow/contrib/framework/python/ops/variables_test.py +++ b/tensorflow/contrib/framework/python/ops/variables_test.py @@ -1284,6 +1284,32 @@ class ZeroInitializerOpTest(test.TestCase): [10, 20], dtype=dtype), use_init) +class ZeroVarInitializerOpTest(test.TestCase): + + def _testZeroVarInitializer(self, shape, initializer, use_init): + var = resource_variable_ops.ResourceVariable(initializer) + var_zero = variables_lib2.zero_initializer(var) + + with self.test_session() as sess: + with self.assertRaisesOpError('Error while reading resource variable'): + var.eval() + if use_init: + sess.run(var.initializer) + with self.assertRaisesOpError('input is already initialized'): + var_zero.eval() + self.assertAllClose(np.ones(shape), var.eval()) + else: + var_zero.eval() + self.assertAllClose(np.zeros(shape), var.eval()) + + def testZeroVarInitializer(self): + for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64): + for use_init in (False, True): + self._testZeroVarInitializer([10, 20], + array_ops.ones([10, 20], dtype=dtype), + use_init) + + class FilterVariablesTest(test.TestCase): def setUp(self): diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index 0e06575d96f9b9538f0245b12d48cfd7c0e8d981..2458f7554afdc12709571c551a8323cda7fa5c17 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -247,7 +247,7 @@ class FusedConv2DBiasActivationOp : public OpKernel { }; #if GOOGLE_CUDA -namespace dnn = ::perftools::gputools::dnn; +namespace dnn = se::dnn; // A dummy type to group forward convolution autotune results together. struct ConvBiasActivationAutoTuneGroup { @@ -543,7 +543,8 @@ void LaunchFusedConv2DBiasActivationOp:: fused_conv_parameters, &algorithm_config)) { std::vector algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( - fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), + fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo( + stream->parent()), &algorithms)); dnn::ProfileResult best_result; dnn::ProfileResult best_result_no_scratch; diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index 47e51415fd9e7daa360ca06a11078f6edcf63b5b..d914f549457a1e893ed43a3b8bc1ae5be7bb4303 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -488,25 +488,25 @@ def frechet_classifier_distance(real_images, The Frechet Inception distance. A floating-point scalar of the same type as the output of `classifier_fn`. """ - real_images_list = array_ops.split( real_images, num_or_size_splits=num_batches) generated_images_list = array_ops.split( generated_images, num_or_size_splits=num_batches) - imgs = array_ops.stack(real_images_list + generated_images_list) + real_imgs = array_ops.stack(real_images_list) + generated_imgs = array_ops.stack(generated_images_list) # Compute the activations using the memory-efficient `map_fn`. - activations = functional_ops.map_fn( - fn=classifier_fn, - elems=imgs, - parallel_iterations=1, - back_prop=False, - swap_memory=True, - name='RunClassifier') + def compute_activations(elems): + return functional_ops.map_fn(fn=classifier_fn, + elems=elems, + parallel_iterations=1, + back_prop=False, + swap_memory=True, + name='RunClassifier') - # Split the activations by the real and generated images. - real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0) + real_a = compute_activations(real_imgs) + gen_a = compute_activations(generated_imgs) # Ensure the activations have the right shapes. real_a = array_ops.concat(array_ops.unstack(real_a), 0) @@ -697,18 +697,20 @@ def frechet_classifier_distance_from_activations(real_activations, # Compute mean and covariance matrices of activations. m = math_ops.reduce_mean(real_activations, 0) m_w = math_ops.reduce_mean(generated_activations, 0) - num_examples = math_ops.to_double(array_ops.shape(real_activations)[0]) + num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0]) + num_examples_generated = math_ops.to_double( + array_ops.shape(generated_activations)[0]) # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T real_centered = real_activations - m sigma = math_ops.matmul( real_centered, real_centered, transpose_a=True) / ( - num_examples - 1) + num_examples_real - 1) gen_centered = generated_activations - m_w sigma_w = math_ops.matmul( gen_centered, gen_centered, transpose_a=True) / ( - num_examples - 1) + num_examples_generated - 1) # Find the Tr(sqrt(sigma sigma_w)) component of FID sqrt_trace_component = trace_sqrt_product(sigma, sigma_w) diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc index 28f68cec8cce126f1b177a73e197ccd7ab749f4a..94f522c04e5a09ed2d9355fa675125c340407923 100644 --- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc +++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc @@ -155,7 +155,7 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous { } Device* dst_device; - Status s = sess->device_mgr->LookupDevice(parsed.dst_device, &dst_device); + Status s = sess->device_mgr()->LookupDevice(parsed.dst_device, &dst_device); if (!s.ok()) { sess->worker_cache->ReleaseWorker(src_worker, rwi); done(s, Args(), recv_args, Tensor{}, false); diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py index e982030bc8959309e72d0f4e02b9755c48535a10..8f406ace1d5dcc13a018e56cc98c621a511da29b 100755 --- a/tensorflow/contrib/image/__init__.py +++ b/tensorflow/contrib/image/__init__.py @@ -25,6 +25,8 @@ projective transforms (including rotation) are supported. @@angles_to_projective_transforms @@compose_transforms @@adjust_yiq_hsv +@@flat_transforms_to_matrices +@@matrices_to_flat_transforms @@random_yiq_hsv @@rotate @@transform @@ -58,6 +60,8 @@ from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_ from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms from tensorflow.contrib.image.python.ops.image_ops import compose_transforms from tensorflow.contrib.image.python.ops.image_ops import connected_components +from tensorflow.contrib.image.python.ops.image_ops import flat_transforms_to_matrices +from tensorflow.contrib.image.python.ops.image_ops import matrices_to_flat_transforms from tensorflow.contrib.image.python.ops.image_ops import rotate from tensorflow.contrib.image.python.ops.image_ops import transform from tensorflow.contrib.image.python.ops.image_ops import translate diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc index 645abbf0b0ea5465dadf55d065e997e16940c18d..bbb3a3b18fd7bfdc68e8b8532568985245154794 100644 --- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc +++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc @@ -59,7 +59,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count, delta_h, scale_s, scale_v, tranformation_matrix.flat().data(), tranformation_matrix.flat().size()); // Call cuBlas C = A * B directly. - auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + auto no_transpose = se::blas::Transpose::kNoTranspose; auto a_ptr = AsDeviceMemory(input->flat().data(), input->flat().size()); auto b_ptr = AsDeviceMemory(tranformation_matrix.flat().data(), diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py index 2a3592c53fdda488561e504ba2712aadc3214cc4..432b67e5690003d37bc0a4f53b3b091468b40a5c 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py @@ -814,6 +814,21 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase): new_cov = sess.run(factor.make_covariance_update_op(0.)) self.assertAllClose([[(1. + 4.) / 2.]], new_cov) + def testSubSample(self): + with tf_ops.Graph().as_default(): + patches_1 = array_ops.constant(1, shape=(10, 2)) + patches_2 = array_ops.constant(1, shape=(10, 8)) + patches_3 = array_ops.constant(1, shape=(3, 3)) + patches_1_sub = ff._subsample_for_cov_computation(patches_1) + patches_2_sub = ff._subsample_for_cov_computation(patches_2) + patches_3_sub = ff._subsample_for_cov_computation(patches_3) + patches_1_sub_batch_size = patches_1_sub.shape.as_list()[0] + patches_2_sub_batch_size = patches_2_sub.shape.as_list()[0] + patches_3_sub_batch_size = patches_3_sub.shape.as_list()[0] + self.assertEqual(2, patches_1_sub_batch_size) + self.assertEqual(8, patches_2_sub_batch_size) + self.assertEqual(3, patches_3_sub_batch_size) + class ConvOutputKroneckerFactorTest(ConvFactorTestCase): diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD index b897fd68a080e819042cd36f2a1acfcf175e656b..cb0917bb851cff2dd25e398ec9dc3cc0ffcddedf 100644 --- a/tensorflow/contrib/kfac/python/ops/BUILD +++ b/tensorflow/contrib/kfac/python/ops/BUILD @@ -37,10 +37,13 @@ py_library( deps = [ ":utils", "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:init_ops", "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", + "//tensorflow/python:random_ops", "//tensorflow/python:special_math_ops", "//tensorflow/python:training", "//tensorflow/python:variable_scope", diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py index 0d40d265a1727075d0ba721b0d9a756c38269a96..7988a3b92bf0139b382e43acc534664ca5bb261e 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops from tensorflow.python.ops import special_math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -55,6 +56,22 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2 # matrix powers. Must be nonnegative. EIGENVALUE_CLIPPING_THRESHOLD = 0.0 +# Used to subsample the flattened extracted image patches. The number of +# outer products per row of the covariance matrix should not exceed this +# value. This parameter is used only if `_SUB_SAMPLE_OUTER_PRODUCTS` is True. +_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = 1 + +# Used to subsample the inputs passed to the extract image patches. The batch +# size of number of inputs to extract image patches is multiplied by this +# factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True. +_INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5 + +# If True, then subsamples the tensor passed to compute the covaraince matrix. +_SUB_SAMPLE_OUTER_PRODUCTS = False + +# If True, then subsamples the tensor passed to compute the covaraince matrix. +_SUB_SAMPLE_INPUTS = False + # TOWER_STRATEGY can be one of "concat" or "separate". If "concat", the data # passed to the factors from the blocks will be concatenated across towers # (lazilly via PartitionedTensor objects). Otherwise a tuple of tensors over @@ -67,12 +84,20 @@ def set_global_constants(init_covariances_at_zero=None, zero_debias=None, eigenvalue_decomposition_threshold=None, eigenvalue_clipping_threshold=None, + max_num_outer_products_per_cov_row=None, + sub_sample_outer_products=None, + inputs_to_extract_patches_factor=None, + sub_sample_inputs=None, tower_strategy=None): """Sets various global constants used by the classes in this module.""" global INIT_COVARIANCES_AT_ZERO global ZERO_DEBIAS global EIGENVALUE_DECOMPOSITION_THRESHOLD global EIGENVALUE_CLIPPING_THRESHOLD + global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW + global _SUB_SAMPLE_OUTER_PRODUCTS + global _INPUTS_TO_EXTRACT_PATCHES_FACTOR + global _SUB_SAMPLE_INPUTS global TOWER_STRATEGY if init_covariances_at_zero is not None: @@ -83,6 +108,14 @@ def set_global_constants(init_covariances_at_zero=None, EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold if eigenvalue_clipping_threshold is not None: EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold + if max_num_outer_products_per_cov_row is not None: + _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row + if sub_sample_outer_products is not None: + _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products + if inputs_to_extract_patches_factor is not None: + _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_patches_factor + if sub_sample_inputs is not None: + _SUB_SAMPLE_INPUTS = sub_sample_inputs if tower_strategy is not None: TOWER_STRATEGY = tower_strategy @@ -227,6 +260,58 @@ def graph_func_to_string(func): return list_to_string(func.func_id) +def _subsample_for_cov_computation(array, name=None): + """Subsamples the first dimension of the array. + + `array`(A) is a tensor of shape `[batch_size, dim_2]`. Then the covariance + matrix(A^TA) is of shape `dim_2 ** 2`. Subsample only if the number of outer + products per row of the covariance matrix is greater than + `_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW`. + + Args: + array: Tensor, of shape `[batch_size, dim_2]`. + name: `string`, Default(None) + + Returns: + A tensor of shape `[max_samples, dim_2]`. + + Raises: + ValueError: If array's is not matrix-shaped. + ValueError: If array's batch_size cannot be inferred. + + """ + with tf_ops.name_scope(name, "subsample", [array]): + array = tf_ops.convert_to_tensor(array) + if len(array.shape) != 2: + raise ValueError("Input param array must be a matrix.") + + batch_size = array.shape.as_list()[0] + if batch_size is None: + raise ValueError("Unable to get batch_size from input param array.") + + num_cov_rows = array.shape.as_list()[-1] + max_batch_size = int(_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW * num_cov_rows) + if batch_size <= max_batch_size: + return array + + return _random_tensor_gather(array, max_batch_size) + + +def _random_tensor_gather(array, max_size): + """Generates a random set of indices and gathers the value at the indcices. + + Args: + array: Tensor, of shape `[batch_size, dim_2]`. + max_size: int, Number of indices to sample. + + Returns: + A tensor of shape `[max_size, ...]`. + """ + batch_size = array.shape.as_list()[0] + indices = random_ops.random_shuffle(math_ops.range(0, batch_size))[:max_size] + return array_ops.gather(array, indices) + + @six.add_metaclass(abc.ABCMeta) class FisherFactor(object): """Base class for objects modeling factors of approximate Fisher blocks. @@ -1153,7 +1238,9 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): dilation_rate=None, data_format=None, extract_patches_fn=None, - has_bias=False): + has_bias=False, + sub_sample_inputs=None, + sub_sample_patches=None): """Initializes ConvInputKroneckerFactor. Args: @@ -1173,6 +1260,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): patches. One of "extract_convolution_patches", "extract_image_patches", "extract_pointwise_conv2d_patches". has_bias: bool. If True, append 1 to in_channel. + sub_sample_inputs: `bool`. If True, then subsample the inputs from which + the image patches are extracted. (Default: None) + sub_sample_patches: `bool`, If `True` then subsample the extracted + patches.(Default: None) """ self._inputs = inputs self._filter_shape = filter_shape @@ -1182,7 +1273,15 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): self._data_format = data_format self._extract_patches_fn = extract_patches_fn self._has_bias = has_bias + if sub_sample_inputs is None: + self._sub_sample_inputs = _SUB_SAMPLE_INPUTS + else: + self._sub_sample_inputs = sub_sample_inputs + if sub_sample_patches is None: + self._sub_sample_patches = _SUB_SAMPLE_OUTER_PRODUCTS + else: + self._sub_sample_patches = sub_sample_patches super(ConvInputKroneckerFactor, self).__init__() @property @@ -1215,6 +1314,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): assert source == 0 inputs = self._inputs[tower] + if self._sub_sample_inputs: + batch_size = inputs.shape.as_list()[0] + max_size = int(batch_size * _INPUTS_TO_EXTRACT_PATCHES_FACTOR) + inputs = _random_tensor_gather(inputs, max_size) # TODO(b/64144716): there is potential here for a big savings in terms of # memory use. @@ -1260,8 +1363,12 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): # |Delta| = number of spatial offsets, and J = number of input maps # for convolutional layer l. patches_flat = array_ops.reshape(patches, [-1, flatten_size]) + # We append a homogenous coordinate to patches_flat if the layer has # bias parameters. This gives us [[A_l]]_H from the paper. + if self._sub_sample_patches: + patches_flat = _subsample_for_cov_computation(patches_flat) + if self._has_bias: patches_flat = append_homog(patches_flat) # We call compute_cov without passing in a normalizer. compute_cov uses diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 151fc7a0d734fe8ea4d7872a4051e82d317a500e..2f3e57653c5d6d949c4dcc91635690322b7f90c4 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -1536,6 +1536,7 @@ def convolution3d_transpose( @add_arg_scope def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None): """Converts a dense tensor into a sparse tensor. + An example use would be to convert dense labels to sparse ones so that they can be fed to the ctc_loss. @@ -2323,11 +2324,16 @@ def images_to_sequence(inputs, outputs_collections=None, scope=None): """Convert a batch of images into a batch of sequences. + Args: inputs: a (num_images, height, width, depth) tensor data_format: A string. `NHWC` (default) and `NCHW` are supported. outputs_collections: The collections to which the outputs are added. scope: Optional scope for name_scope. + + Raises: + ValueError: If `data_format` is not either NCHW or NHWC. + Returns: (width, num_images*height, depth) sequence tensor """ @@ -2833,6 +2839,7 @@ def sequence_to_images(inputs, outputs_collections=None, scope=None): """Convert a batch of sequences into a batch of images. + Args: inputs: (num_steps, num_batches, depth) sequence tensor height: the height of the images @@ -2840,6 +2847,7 @@ def sequence_to_images(inputs, Currently supports `'channels_first'` and `'channels_last'`. outputs_collections: The collections to which the outputs are added. scope: Optional scope for name_scope. + Returns: A tensor representing the output of the operation. """ @@ -2849,7 +2857,7 @@ def sequence_to_images(inputs, if num_batches is None: num_batches = -1 else: - num_batches = num_batches // height + num_batches //= height reshaped = array_ops.reshape(inputs, [width, num_batches, height, depth]) if output_data_format == 'channels_first': diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py index c4fa3392ef9d5c5dd3760e55f37ed565580918d4..02d294c68f1e10108d774c5fe23b6371a7a9f0e6 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py @@ -504,7 +504,11 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): @_fn_with_custom_grad(grad_fn) def fn_with_recompute(*args): cached_vs.append(variable_scope.get_variable_scope()) - cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) + # TODO(rsepassi): Rm conditional in TF 1.4 + if hasattr(contrib_framework_ops, "current_arg_scope"): + cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) + else: + cached_arg_scope.append({}) return fn(*args) return fn_with_recompute(*args) diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py index 3e639a180ef11af5f7f498c647eb25417f918eb9..69bb6be81453f5f5487f25547f017dc5f87c2f2c 100644 --- a/tensorflow/contrib/layers/python/layers/target_column.py +++ b/tensorflow/contrib/layers/python/layers/target_column.py @@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn): def logits_to_predictions(self, logits, proba=False): if self.num_label_columns == 1: - return array_ops.squeeze(logits, squeeze_dims=[1]) + return array_ops.squeeze(logits, axis=[1]) return logits def get_eval_ops(self, features, logits, labels, metrics=None): @@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target): "Instead got %s." % target.dtype) # sparse_softmax_cross_entropy_with_logits requires [batch_size] target. if len(target.get_shape()) == 2: - target = array_ops.squeeze(target, squeeze_dims=[1]) + target = array_ops.squeeze(target, axis=[1]) loss_vec = nn.sparse_softmax_cross_entropy_with_logits( labels=target, logits=logits) return loss_vec diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index d665fc9335cf22cdfa1e7330ab67003042502515..3b053cd4c66952cf6c494186b16c17f38801bcaf 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -281,7 +281,10 @@ py_test( size = "medium", srcs = ["python/learn/estimators/estimator_test.py"], srcs_version = "PY2AND3", - tags = ["manual"], + tags = [ + "manual", + "noasan", # times out + ], deps = [ ":learn", "//tensorflow/contrib/framework:framework_py", diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index d81a534b79bc90fe91ffd3cb97a7865a7cb4c2a9..9e5aaf3118dfed4ce64dd244a915860b5a2eef44 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -715,7 +715,9 @@ class EstimatorTest(test.TestCase): ckpt = checkpoint_state_pb2.CheckpointState() text_format.Merge(checkpoint_file_content, ckpt) self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5') - self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'], + # TODO(b/78461127): Please modify tests to not directly rely on names of + # checkpoints. + self.assertAllEqual(['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths) def test_train_save_copy_reload(self): diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 2b4b6eff39f4fc8a20a149edfc07d2f4f27a9bae..e28e6854a5097d66cb486be3e82f3726f5cc70fd 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -777,7 +777,7 @@ class _RegressionHead(_SingleHead): key = prediction_key.PredictionKey.SCORES with ops.name_scope(None, "predictions", (logits,)): if self.logits_dimension == 1: - logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key) + logits = array_ops.squeeze(logits, axis=(1,), name=key) return {key: self._link_fn(logits)} def _metrics(self, eval_loss, predictions, labels, weights): @@ -974,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None): is_squeezed_labels = False # TODO(ptucker): This will break for dynamic shapes. if len(labels.get_shape()) == 2: - labels = array_ops.squeeze(labels, squeeze_dims=(1,)) + labels = array_ops.squeeze(labels, axis=(1,)) is_squeezed_labels = True loss = nn.sparse_softmax_cross_entropy_with_logits( diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py index 92976d1539c7ddc226b81f903beee82b798ec8db..9f2cadb01747c5a8e4ee75ac38f423f85e11bbba 100644 --- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py +++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py @@ -40,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None): [tensor_in, labels]): predictions = nn.xw_plus_b(tensor_in, weights, biases) if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2: - predictions = array_ops_.squeeze(predictions, squeeze_dims=[1]) + predictions = array_ops_.squeeze(predictions, axis=[1]) return predictions, losses.mean_squared_error(labels, predictions) diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD index 8c2c4fd29c0502d4199f27a65e4827b2db973c3d..4ce91a140f816ddc8bdc60287e4cbc807172ec6d 100644 --- a/tensorflow/contrib/legacy_seq2seq/BUILD +++ b/tensorflow/contrib/legacy_seq2seq/BUILD @@ -58,5 +58,8 @@ cuda_py_tests( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", ], - tags = ["noasan"], # times out b/63678675 + tags = [ + "noasan", # times out b/63678675 + "optonly", # times out (flaky) + ], ) diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD index 2c5fa7af89bccefb84c7b0e0c0a628e3ce737706..2e92ad6eb39d8aa8876a34572f50d5b6aff0511a 100644 --- a/tensorflow/contrib/linalg/BUILD +++ b/tensorflow/contrib/linalg/BUILD @@ -59,7 +59,10 @@ cuda_py_test( "//tensorflow/python:platform_test", ], shard_count = 5, - tags = ["noasan"], + tags = [ + "noasan", + "optonly", + ], ) cuda_py_test( @@ -78,5 +81,8 @@ cuda_py_test( "//tensorflow/python:platform_test", ], shard_count = 8, - tags = ["noasan"], + tags = [ + "noasan", + "optonly", + ], ) diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py index 38bd66b13f79ee918acbdc2d33a6367e4e34d4b1..554854da84715ee8c8d00ec7f8e3156642b43d80 100644 --- a/tensorflow/contrib/linalg/__init__.py +++ b/tensorflow/contrib/linalg/__init__.py @@ -18,6 +18,9 @@ See the @{$python/contrib.linalg} guide. @@LinearOperator @@LinearOperatorBlockDiag +@@LinearOperatorCirculant +@@LinearOperatorCirculant2D +@@LinearOperatorCirculant3D @@LinearOperatorDiag @@LinearOperatorIdentity @@LinearOperatorScaledIdentity @@ -39,6 +42,7 @@ from tensorflow.contrib.linalg.python.ops.linear_operator_addition import * from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import * from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import * from tensorflow.python.ops.linalg.linear_operator import * +from tensorflow.python.ops.linalg.linear_operator_circulant import * from tensorflow.python.ops.linalg.linear_operator_composition import * from tensorflow.python.ops.linalg.linear_operator_diag import * from tensorflow.python.ops.linalg.linear_operator_full_matrix import * diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py index 6e6c812adcbaf7e90d7c10c05fdfc0e150829329..b5741967ab52568725d7c9f03a0cc0b0f63f7459 100644 --- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py +++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py @@ -39,8 +39,8 @@ from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import googletest _MAX_ITERATIONS = 100 -_SHARD_NUMBERS = [None, 1, 3, 10] -_NUM_LOSS_PARTITIONS = [2, 4] +_SHARD_NUMBERS = [None, 1, 3] +_NUM_LOSS_PARTITIONS = [4] def make_example_proto(feature_dict, target, value=1.0): diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py index ec726bbed41a86eb314e3591ecaedaa6bf0e5e9b..5015fb0848107950dd27eb81431dd308f22858bc 100644 --- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py +++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py @@ -49,6 +49,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface): default_value, empty_key, num_shards=1, + checkpoint=True, name='ShardedMutableHashTable'): with ops.name_scope(name, 'sharded_mutable_hash_table') as scope: super(ShardedMutableDenseHashTable, self).__init__(key_dtype, @@ -61,6 +62,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface): value_dtype=value_dtype, default_value=default_value, empty_key=empty_key, + checkpoint=checkpoint, name='%s-%d-of-%d' % (name, i + 1, num_shards))) self._table_shards = table_shards # TODO(andreasst): add a value_shape() method to LookupInterface diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index b8f6b7fd59af9834edb4aa7aefa524c25ede66d2..85216776823eab2ab3ac2a3bc666f21e312acc6c 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -124,19 +124,19 @@ def tf_to_tflite(name, src, options, out): out: name of the output flatbuffer file. """ - toco = "//tensorflow/contrib/lite/toco:toco" + toco_cmdline = " ".join([ + "//tensorflow/contrib/lite/toco:toco", + "--input_format=TENSORFLOW_GRAPHDEF", + "--output_format=TFLITE", + ("--input_file=$(location %s)" % src), + ("--output_file=$(location %s)" % out), + ] + options ) native.genrule( name = name, - srcs=[src, options], + srcs=[src], outs=[out], - cmd = ("$(location %s) " + - " --input_file=$(location %s) " + - " --output_file=$(location %s) " + - " --input_format=TENSORFLOW_GRAPHDEF" + - " --output_format=TFLITE" + - " `cat $(location %s)`") - % (toco, src, out, options), - tools= [toco], + cmd = toco_cmdline, + tools= ["//tensorflow/contrib/lite/toco:toco"], ) def tflite_to_json(name, src, out): diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h index 0b38f43cd32fbdfa0296eec7ef81aab76ebe5461..12841d233cc1d3c5e1219fc505b1975d2a7fa3e3 100644 --- a/tensorflow/contrib/lite/context.h +++ b/tensorflow/contrib/lite/context.h @@ -275,7 +275,7 @@ typedef struct { typedef struct TfLiteContext { // Number of tensors in the context. - int tensors_size; + size_t tensors_size; // The execution plan contains a list of the node indices in execution // order. execution_plan->size is the current number of nodes. And, @@ -397,13 +397,13 @@ typedef struct _TfLiteDelegate { // This can be null if the delegate doesn't use its own buffer. TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size); + void* data, size_t size); // Copy the data from raw memory to delegate buffer handle. // This can be null if the delegate doesn't use its own buffer. TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size); + void* data, size_t size); // Free the Delegate Buffer Handle. Note: This only frees the handle, but // this doesn't release the underlying resource (e.g. textures). The diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 91b6c414bf036fbf57f53fc75f570b05449fa89e..9d8ea55fd1edc0dacc821536cc2b564c59f65b71 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -308,7 +308,12 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label, for (int i = 0; i < length; i++) { int index = indices[i]; - if (index < kOptionalTensor || index >= context_.tensors_size) { + // Continue if index == kOptionalTensor before additional comparisons below, + // size_t(-1) is always >= context_tensors_size. + if (index == kOptionalTensor) { + continue; + } + if (index < 0 || static_cast(index) >= context_.tensors_size) { ReportError(&context_, "Invalid tensor index %d in %s\n", index, label); consistent_ = false; return kTfLiteError; @@ -318,7 +323,7 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label, } TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims, - int dims_size, size_t* bytes) { + size_t dims_size, size_t* bytes) { // TODO(aselle): Check for overflow here using overflow.h in TensorFlow // MultiplyWithoutOverflow. TF_LITE_ENSURE(&context_, bytes != nullptr); @@ -645,7 +650,7 @@ TfLiteStatus Interpreter::GetNodeAndRegistration( } TfLiteStatus Interpreter::SetTensorParametersReadOnly( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization, const char* buffer, size_t bytes, const Allocation* allocation) { if (state_ == kStateInvokableAndImmutable) { @@ -691,7 +696,7 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly( // bytes. The lifetime of buffer must be ensured to be greater or equal // to Interpreter. TfLiteStatus Interpreter::SetTensorParametersReadWrite( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization) { if (state_ == kStateInvokableAndImmutable) { ReportError( diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index a49134b95ee47b97ad1e56d07f737fa44f89badc..6f3433abcf71b6090b434d47e925775a2e517064 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -150,7 +150,7 @@ class Interpreter { }; TfLiteStatus SetTensorParametersReadOnly( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization, const char* buffer, size_t bytes, const Allocation* allocation = nullptr); @@ -165,7 +165,7 @@ class Interpreter { dims.data(), quantization); } TfLiteStatus SetTensorParametersReadWrite( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization); // Functions to access tensor data @@ -189,10 +189,10 @@ class Interpreter { } // Return the number of tensors in the model. - int tensors_size() const { return context_.tensors_size; } + size_t tensors_size() const { return context_.tensors_size; } // Return the number of ops in the model. - int nodes_size() const { return nodes_and_registration_.size(); } + size_t nodes_size() const { return nodes_and_registration_.size(); } // WARNING: Experimental interface, subject to change const std::vector& execution_plan() const { return execution_plan_; } @@ -406,7 +406,7 @@ class Interpreter { // Compute the number of bytes required to represent a tensor with dimensions // specified by the array dims (of length dims_size). Returns the status code // and bytes. - TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size, + TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size, size_t* bytes); // Request an tensor be resized implementation. If the given tensor is of @@ -467,7 +467,7 @@ class Interpreter { // tensors. After calling this function, adding `kTensorsCapacityHeadroom` // more tensors won't invalidate the pointer to existing tensors. void EnsureTensorsVectorCapacity() { - const int required_capacity = tensors_size() + kTensorsCapacityHeadroom; + const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom; if (required_capacity > tensors_.capacity()) { tensors_.reserve(required_capacity); context_.tensors = tensors_.data(); diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc index 131e088079857af34478645b7f1559364d03a493..453c1ada1cf6263be14a3b170f209e3a30580cc3 100644 --- a/tensorflow/contrib/lite/interpreter_test.cc +++ b/tensorflow/contrib/lite/interpreter_test.cc @@ -887,15 +887,15 @@ class TestDelegate : public ::testing::Test { TfLiteIntArrayFree(nodes_to_separate); return kTfLiteOk; }; - delegate_.CopyToBufferHandle = [](TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - void* data, int size) -> TfLiteStatus { + delegate_.CopyToBufferHandle = + [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, + void* data, size_t size) -> TfLiteStatus { // TODO(ycling): Implement tests to test buffer copying logic. return kTfLiteOk; }; delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size) -> TfLiteStatus { + void* data, size_t size) -> TfLiteStatus { // TODO(ycling): Implement tests to test buffer copying logic. return kTfLiteOk; }; diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java index 18f64651889d7eeb4be961afc47554cbcc51a410..4f5662bc2d15f1bf6bfec0b9ec79b09f9e124186 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java +++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java @@ -47,6 +47,8 @@ import android.os.HandlerThread; import android.support.annotation.NonNull; import android.support.v13.app.FragmentCompat; import android.support.v4.content.ContextCompat; +import android.text.SpannableString; +import android.text.SpannableStringBuilder; import android.util.Log; import android.util.Size; import android.view.LayoutInflater; @@ -207,14 +209,21 @@ public class Camera2BasicFragment extends Fragment * * @param text The message to show */ - private void showToast(final String text) { + private void showToast(String s) { + SpannableStringBuilder builder = new SpannableStringBuilder(); + SpannableString str1 = new SpannableString(s); + builder.append(str1); + showToast(builder); + } + + private void showToast(SpannableStringBuilder builder) { final Activity activity = getActivity(); if (activity != null) { activity.runOnUiThread( new Runnable() { @Override public void run() { - textView.setText(text); + textView.setText(builder, TextView.BufferType.SPANNABLE); } }); } @@ -682,8 +691,9 @@ public class Camera2BasicFragment extends Fragment showToast("Uninitialized Classifier or invalid context."); return; } + SpannableStringBuilder textToShow = new SpannableStringBuilder(); Bitmap bitmap = textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY()); - String textToShow = classifier.classifyFrame(bitmap); + classifier.classifyFrame(bitmap, textToShow); bitmap.recycle(); showToast(textToShow); } diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java index d32c0779101cf8e795ee9d7e970401c2c03bb33a..7bb6afd9d8b77159bb180fad6bbe43ca454f9d14 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java +++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java @@ -19,10 +19,11 @@ import android.app.Activity; import android.content.res.AssetFileDescriptor; import android.graphics.Bitmap; import android.os.SystemClock; +import android.text.SpannableString; +import android.text.SpannableStringBuilder; +import android.text.style.ForegroundColorSpan; +import android.text.style.RelativeSizeSpan; import android.util.Log; - -import org.tensorflow.lite.Interpreter; - import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; @@ -37,11 +38,15 @@ import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.PriorityQueue; +import org.tensorflow.lite.Interpreter; /** * Classifies images with Tensorflow Lite. */ public abstract class ImageClassifier { + // Display preferences + private static final float GOOD_PROB_THRESHOLD = 0.3f; + private static final int SMALL_COLOR = 0xffddaa88; /** Tag for the {@link Log}. */ private static final String TAG = "TfLiteCameraDemo"; @@ -99,10 +104,12 @@ public abstract class ImageClassifier { } /** Classifies a frame from the preview stream. */ - String classifyFrame(Bitmap bitmap) { + void classifyFrame(Bitmap bitmap, SpannableStringBuilder builder) { + printTopKLabels(builder); + if (tflite == null) { Log.e(TAG, "Image classifier has not been initialized; Skipped."); - return "Uninitialized Classifier."; + builder.append(new SpannableString("Uninitialized Classifier.")); } convertBitmapToByteBuffer(bitmap); // Here's where the magic happens!!! @@ -115,9 +122,10 @@ public abstract class ImageClassifier { applyFilter(); // Print the results. - String textToShow = printTopKLabels(); - textToShow = Long.toString(endTime - startTime) + "ms" + textToShow; - return textToShow; + long duration = endTime - startTime; + SpannableString span = new SpannableString(duration + " ms"); + span.setSpan(new ForegroundColorSpan(android.graphics.Color.LTGRAY), 0, span.length(), 0); + builder.append(span); } void applyFilter() { @@ -202,7 +210,7 @@ public abstract class ImageClassifier { } /** Prints top-K labels, to be shown in UI as the results. */ - private String printTopKLabels() { + private void printTopKLabels(SpannableStringBuilder builder) { for (int i = 0; i < getNumLabels(); ++i) { sortedLabels.add( new AbstractMap.SimpleEntry<>(labelList.get(i), getNormalizedProbability(i))); @@ -210,13 +218,27 @@ public abstract class ImageClassifier { sortedLabels.poll(); } } - String textToShow = ""; + final int size = sortedLabels.size(); - for (int i = 0; i < size; ++i) { + for (int i = 0; i < size; i++) { Map.Entry label = sortedLabels.poll(); - textToShow = String.format("\n%s: %4.2f", label.getKey(), label.getValue()) + textToShow; + SpannableString span = + new SpannableString(String.format("%s: %4.2f\n", label.getKey(), label.getValue())); + int color; + // Make it white when probability larger than threshold. + if (label.getValue() > GOOD_PROB_THRESHOLD) { + color = android.graphics.Color.WHITE; + } else { + color = SMALL_COLOR; + } + // Make first item bigger. + if (i == size - 1) { + float sizeScale = (i == size - 1) ? 1.25f : 0.8f; + span.setSpan(new RelativeSizeSpan(sizeScale), 0, span.length(), 0); + } + span.setSpan(new ForegroundColorSpan(color), 0, span.length(), 0); + builder.insert(0, span); } - return textToShow; } /** diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png index c22509d8dfccae14d9470e3042a9ed5b469ca2c9..52cf2ab95296d675dd42533bb9136707adebd98c 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png index d68af39186ca9cd2bc755cad8397467a11844a1d..b75f892c462a12cae4f09851d019db23b286f843 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png index 15e419b7ccd88651bd21dac36853a827fc4075b8..36e14c48d14a8d3e5bf37d3caaee661061cec3be 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png index 342ce34e1663960d8d7050a9be57face3571d336..06dd2a740ec2abaec4919c991dd17ee007ffcf28 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..b94bcfc081e0b036fbba271d7cbfb986575d4abf Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml index a84f1bbfa0cb48a3fc335c9bc4aa7d8e93d20e75..20f520814d7154764932638c5e9dddc32639b677 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml @@ -14,37 +14,50 @@ limitations under the License. --> - - - - - + + + + + + + + + + - - + diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml new file mode 100644 index 0000000000000000000000000000000000000000..72a229ecdb19f5309994e994d82e0b5b5ed617a2 --- /dev/null +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml @@ -0,0 +1,88 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml index db557ad62f619e88f72426a48a74bffb0f57b818..d12435d5abda45917b8a4f12c4b3179997eae689 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml @@ -14,9 +14,30 @@ limitations under the License. --> + + + + + + - + + + + android:layout_alignParentStart="true" + android:layout_alignTop="@+id/control" + android:layout_marginLeft="300dp" + android:layout_marginStart="300dp" + android:background="#bb7700"> - + android:layout_alignParentLeft="true" + android:layout_alignParentStart="true" /> - + + outputs) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } Tensor[] tensors = wrapper.run(inputs); if (outputs == null || tensors == null || outputs.size() > tensors.length) { - throw new IllegalArgumentException("Outputs do not match with model outputs."); + throw new IllegalArgumentException("Output error: Outputs do not match with model outputs."); } final int size = tensors.length; for (Integer idx : outputs.keySet()) { if (idx == null || idx < 0 || idx >= size) { throw new IllegalArgumentException( - String.format("Invalid index of output %d (should be in range [0, %d))", idx, size)); + String.format( + "Output error: Invalid index of output %d (should be in range [0, %d))", + idx, size)); } tensors[idx].copyTo(outputs.get(idx)); } @@ -160,7 +162,7 @@ public final class Interpreter implements AutoCloseable { */ public void resizeInput(int idx, @NonNull int[] dims) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } wrapper.resizeInput(idx, dims); } @@ -173,7 +175,7 @@ public final class Interpreter implements AutoCloseable { */ public int getInputIndex(String opName) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } return wrapper.getInputIndex(opName); } @@ -186,7 +188,7 @@ public final class Interpreter implements AutoCloseable { */ public int getOutputIndex(String opName) { if (wrapper == null) { - throw new IllegalStateException("The Interpreter has already been closed."); + throw new IllegalStateException("Internal error: The Interpreter has already been closed."); } return wrapper.getOutputIndex(opName); } @@ -198,7 +200,7 @@ public final class Interpreter implements AutoCloseable { */ public Long getLastNativeInferenceDurationNanoseconds() { if (wrapper == null) { - throw new IllegalStateException("The interpreter has already been closed."); + throw new IllegalStateException("Internal error: The interpreter has already been closed."); } return wrapper.getLastNativeInferenceDurationNanoseconds(); } @@ -208,7 +210,8 @@ public final class Interpreter implements AutoCloseable { if (wrapper != null) { wrapper.setUseNNAPI(useNNAPI); } else { - throw new IllegalStateException("NativeInterpreterWrapper has already been closed."); + throw new IllegalStateException( + "Internal error: NativeInterpreterWrapper has already been closed."); } } diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java index 61a552db2303fdddbbf7ff6c14067794d0c30898..2fc803715be5e5afbc5d548d46c1665785f6055d 100644 --- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java +++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java @@ -80,7 +80,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { /** Sets inputs, runs model inference and returns outputs. */ Tensor[] run(Object[] inputs) { if (inputs == null || inputs.length == 0) { - throw new IllegalArgumentException("Invalid inputs. Inputs should not be null or empty."); + throw new IllegalArgumentException("Input error: Inputs should not be null or empty."); } int[] dataTypes = new int[inputs.length]; Object[] sizes = new Object[inputs.length]; @@ -92,7 +92,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { ByteBuffer buffer = (ByteBuffer) inputs[i]; if (buffer.order() != ByteOrder.nativeOrder()) { throw new IllegalArgumentException( - "Invalid ByteBuffer. It shoud use ByteOrder.nativeOrder()."); + "Input error: ByteBuffer shoud use ByteOrder.nativeOrder()."); } numsOfBytes[i] = buffer.limit(); sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]); @@ -103,7 +103,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { } else { throw new IllegalArgumentException( String.format( - "%d-th element of the %d inputs is not an array or a ByteBuffer.", + "Input error: %d-th element of the %d inputs is not an array or a ByteBuffer.", i, inputs.length)); } } @@ -119,7 +119,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { this, isMemoryAllocated); if (outputsHandles == null || outputsHandles.length == 0) { - throw new IllegalStateException("Interpreter has no outputs."); + throw new IllegalStateException("Internal error: Interpreter has no outputs."); } isMemoryAllocated = true; Tensor[] outputs = new Tensor[outputsHandles.length]; @@ -173,7 +173,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { } else { throw new IllegalArgumentException( String.format( - "%s is not a valid name for any input. The indexes of the inputs are %s", + "Input error: %s is not a valid name for any input. " + + "The indexes of the inputs are %s", name, inputsIndexes.toString())); } } @@ -194,7 +195,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { } else { throw new IllegalArgumentException( String.format( - "%s is not a valid name for any output. The indexes of the outputs are %s", + "Input error: %s is not a valid name for any output. " + + "The indexes of the outputs are %s", name, outputsIndexes.toString())); } } @@ -233,7 +235,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { return DataType.BYTEBUFFER; } } - throw new IllegalArgumentException("cannot resolve DataType of " + o.getClass().getName()); + throw new IllegalArgumentException( + "DataType error: cannot resolve DataType of " + o.getClass().getName()); } /** Returns the shape of an object as an int array. */ @@ -249,7 +252,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { return 0; } if (Array.getLength(o) == 0) { - throw new IllegalArgumentException("array lengths cannot be 0."); + throw new IllegalArgumentException("Array lengths cannot be 0."); } return 1 + numDimensions(Array.get(o, 0)); } @@ -263,7 +266,7 @@ final class NativeInterpreterWrapper implements AutoCloseable { shape[dim] = len; } else if (shape[dim] != len) { throw new IllegalArgumentException( - String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim)); + String.format("Mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim)); } for (int i = 0; i < len; ++i) { fillShape(Array.get(o, i), dim + 1, shape); diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java index 54ace6c63ce5bd1b38be744176d0378e3cc8a1d3..09e887aae3339e9f114c07d689c0d7b5e2fc384b 100644 --- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java +++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java @@ -34,15 +34,16 @@ final class Tensor { if (NativeInterpreterWrapper.dataTypeOf(dst) != dtype) { throw new IllegalArgumentException( String.format( - "Cannot convert an TensorFlowLite tensor with type %s to a Java object of " - + "type %s (which is compatible with the TensorFlowLite type %s)", + "Output error: Cannot convert an TensorFlowLite tensor with type %s to a Java " + + "object of type %s (which is compatible with the TensorFlowLite type %s)", dtype, dst.getClass().getName(), NativeInterpreterWrapper.dataTypeOf(dst))); } int[] dstShape = NativeInterpreterWrapper.shapeOf(dst); if (!Arrays.equals(dstShape, shapeCopy)) { throw new IllegalArgumentException( String.format( - "Shape of output target %s does not match with the shape of the Tensor %s.", + "Output error: Shape of output target %s does not match with the shape of the " + + "Tensor %s.", Arrays.toString(dstShape), Arrays.toString(shapeCopy))); } readMultiDimensionalArray(nativeHandle, dst); diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc index 1578c9e3ddd034ad9ce17c8c3ae6c942258e2a55..34d91be04cd6c855a2068510ca810c0b93637584 100644 --- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc +++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc @@ -44,7 +44,8 @@ BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) { buffer_ = new char[limit]; if (!buffer_) { throwException(env, kNullPointerException, - "Malloc of BufferErrorReporter to hold %d char failed.", + "Internal error: Malloc of BufferErrorReporter to hold %d " + "char failed.", limit); return; } diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index 4c33a2dba4debf726c49016106502b7ef26ef122..45f510da1d940a288e2794cb3e08f66451956b64 100644 --- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -22,7 +22,7 @@ const int kBufferSize = 256; tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) { if (handle == 0) { throwException(env, kIllegalArgumentException, - "Invalid handle to Interpreter."); + "Internal error: Invalid handle to Interpreter."); return nullptr; } return reinterpret_cast(handle); @@ -30,7 +30,8 @@ tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) { tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) { if (handle == 0) { - throwException(env, kIllegalArgumentException, "Invalid handle to model."); + throwException(env, kIllegalArgumentException, + "Internal error: Invalid handle to model."); return nullptr; } return reinterpret_cast(handle); @@ -39,7 +40,7 @@ tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) { BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) { if (handle == 0) { throwException(env, kIllegalArgumentException, - "Invalid handle to ErrorReporter."); + "Internal error: Invalid handle to ErrorReporter."); return nullptr; } return reinterpret_cast(handle); @@ -51,7 +52,7 @@ std::vector convertJIntArrayToVector(JNIEnv* env, jintArray inputs) { jint* ptr = env->GetIntArrayElements(inputs, nullptr); if (ptr == nullptr) { throwException(env, kIllegalArgumentException, - "Empty dimensions of input array."); + "Array has empty dimensions."); return {}; } for (int i = 0; i < size; ++i) { @@ -113,7 +114,7 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, jobjectArray sizes) { if (input_size != interpreter->inputs().size()) { throwException(env, kIllegalArgumentException, - "Expected num of inputs is %d but got %d", + "Input error: Expected num of inputs is %d but got %d", interpreter->inputs().size(), input_size); return kTfLiteError; } @@ -121,8 +122,9 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, input_size != env->GetArrayLength(nums_of_bytes) || input_size != env->GetArrayLength(values)) { throwException(env, kIllegalArgumentException, - "Arrays in arguments should be of the same length, but got " - "%d sizes, %d data_types, %d nums_of_bytes, and %d values", + "Internal error: Arrays in arguments should be of the same " + "length, but got %d sizes, %d data_types, %d nums_of_bytes, " + "and %d values", input_size, env->GetArrayLength(data_types), env->GetArrayLength(nums_of_bytes), env->GetArrayLength(values)); @@ -136,8 +138,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, int num_dims = static_cast(env->GetArrayLength(dims)); if (target->dims->size != num_dims) { throwException(env, kIllegalArgumentException, - "%d-th input should have %d dimensions, but found %d " - "dimensions", + "Input error: %d-th input should have %d dimensions, but " + "found %d dimensions", i, target->dims->size, num_dims); return kTfLiteError; } @@ -150,7 +152,8 @@ TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter, num_dims); printDims(obtained_dims.get(), kBufferSize, ptr, num_dims); throwException(env, kIllegalArgumentException, - "%d-th input dimension should be [%s], but found [%s]", + "Input error: %d-th input dimension should be [%s], but " + "found [%s]", i, expected_dims.get(), obtained_dims.get()); env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT); return kTfLiteError; @@ -236,8 +239,8 @@ TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter, TfLiteType type = resolveDataType(data_type[i]); if (type != target->type) { throwException(env, kIllegalArgumentException, - "DataType (%d) of input data does not match with the " - "DataType (%d) of model inputs.", + "Input error: DataType (%d) of input data does not " + "match with the DataType (%d) of model inputs.", type, target->type); return kTfLiteError; } @@ -270,7 +273,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env, jclass string_class = env->FindClass("java/lang/String"); if (string_class == nullptr) { throwException(env, kUnsupportedOperationException, - "Can not find java/lang/String class to get input names."); + "Internal error: Can not find java/lang/String class to get " + "input names."); return nullptr; } size_t size = interpreter->inputs().size(); @@ -292,7 +296,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env, jclass string_class = env->FindClass("java/lang/String"); if (string_class == nullptr) { throwException(env, kUnsupportedOperationException, - "Can not find java/lang/String class to get output names."); + "Internal error: Can not find java/lang/String class to get " + "output names."); return nullptr; } size_t size = interpreter->outputs().size(); @@ -361,8 +366,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel( path, verifier.get(), error_reporter); if (!model) { throwException(env, kIllegalArgumentException, - "Contents of %s does not encode a valid TensorFlowLite " - "model: %s", + "Contents of %s does not encode a valid " + "TensorFlowLite model: %s", path, error_reporter->CachedErrorMessage()); env->ReleaseStringUTFChars(model_file, path); return 0; @@ -390,8 +395,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer( buf, static_cast(capacity), error_reporter); if (!model) { throwException(env, kIllegalArgumentException, - "MappedByteBuffer does not encode a valid TensorFlowLite " - "model: %s", + "MappedByteBuffer does not encode a valid " + "TensorFlowLite model: %s", error_reporter->CachedErrorMessage()); return 0; } @@ -413,7 +418,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter( &interpreter, static_cast(num_threads)); if (status != kTfLiteOk) { throwException(env, kIllegalArgumentException, - "Cannot create interpreter: %s", + "Internal error: Cannot create interpreter: %s", error_reporter->CachedErrorMessage()); return 0; } @@ -421,7 +426,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter( status = interpreter->AllocateTensors(); if (status != kTfLiteOk) { throwException(env, kNullPointerException, - "Can not allocate memory for the interpreter", + "Internal error: Cannot allocate memory for the interpreter", error_reporter->CachedErrorMessage()); return 0; } @@ -450,7 +455,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( // resizes inputs status = resizeInputs(env, interpreter, input_size, sizes); if (status != kTfLiteOk) { - throwException(env, kNullPointerException, "Can not resize the input: %s", + throwException(env, kNullPointerException, + "Internal error: Can not resize the input: %s", error_reporter->CachedErrorMessage()); return nullptr; } @@ -458,7 +464,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( status = interpreter->AllocateTensors(); if (status != kTfLiteOk) { throwException(env, kNullPointerException, - "Can not allocate memory for the given inputs: %s", + "Internal error: Can not allocate memory for the given " + "inputs: %s", error_reporter->CachedErrorMessage()); return nullptr; } @@ -471,7 +478,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( // runs inference if (interpreter->Invoke() != kTfLiteOk) { throwException(env, kIllegalArgumentException, - "Failed to run on the given Interpreter: %s", + "Internal error: Failed to run on the given Interpreter: %s", error_reporter->CachedErrorMessage()); return nullptr; } @@ -489,8 +496,9 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_run( // returns outputs const std::vector& results = interpreter->outputs(); if (results.empty()) { - throwException(env, kIllegalArgumentException, - "The Interpreter does not have any outputs."); + throwException( + env, kIllegalArgumentException, + "Internal error: The Interpreter does not have any outputs."); return nullptr; } jlongArray outputs = env->NewLongArray(results.size()); @@ -511,7 +519,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims( const int idx = static_cast(input_idx); if (input_idx < 0 || input_idx >= interpreter->inputs().size()) { throwException(env, kIllegalArgumentException, - "Out of range: Failed to get %d-th input out of %d inputs", + "Input error: Out of range: Failed to get %d-th input out of" + " %d inputs", input_idx, interpreter->inputs().size()); return nullptr; } @@ -524,8 +533,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims( } if (num_bytes != expected_num_bytes) { throwException(env, kIllegalArgumentException, - "Failed to get input dimensions. %d-th input should have" - " %d bytes, but found %d bytes.", + "Input error: Failed to get input dimensions. %d-th input " + "should have %d bytes, but found %d bytes.", idx, expected_num_bytes, num_bytes); return nullptr; } @@ -543,8 +552,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputDataType( const int idx = static_cast(output_idx); if (output_idx < 0 || output_idx >= interpreter->outputs().size()) { throwException(env, kIllegalArgumentException, - "Out of range: Failed to get %d-th output out of %d outputs", - output_idx, interpreter->outputs().size()); + "Failed to get %d-th output out of %d outputs", output_idx, + interpreter->outputs().size()); return -1; } TfLiteTensor* target = interpreter->tensor(interpreter->outputs()[idx]); @@ -565,7 +574,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( const int idx = static_cast(input_idx); if (idx < 0 || idx >= interpreter->inputs().size()) { throwException(env, kIllegalArgumentException, - "Can not resize %d-th input for a model having %d inputs.", + "Input error: Can not resize %d-th input for a model having " + "%d inputs.", idx, interpreter->inputs().size()); return JNI_FALSE; } @@ -577,7 +587,7 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput( interpreter->inputs()[idx], convertJIntArrayToVector(env, dims)); if (status != kTfLiteOk) { throwException(env, kIllegalArgumentException, - "Failed to resize %d-th input: %s", idx, + "Internal error: Failed to resize %d-th input: %s", idx, error_reporter->CachedErrorMessage()); return JNI_FALSE; } diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc index 65126e78a3003f8a69c69326124d613e878c0f9d..17f4be09c63a9e80feda9d0324d49cc0418fe66a 100644 --- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc +++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc @@ -23,7 +23,7 @@ namespace { TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) { if (handle == 0) { throwException(env, kIllegalArgumentException, - "Invalid handle to TfLiteTensor."); + "Internal error: Invalid handle to TfLiteTensor."); return nullptr; } return reinterpret_cast(handle); @@ -36,7 +36,8 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type, size_t to_copy = num_elements * elementByteSize(type); if (to_copy > dst_size) { throwException(env, kIllegalStateException, - "cannot write Java array of %d bytes to Tensor of %d bytes", + "Internal error: cannot write Java array of %d bytes to " + "Tensor of %d bytes", to_copy, dst_size); return 0; } @@ -71,10 +72,10 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type, } default: { throwException(env, kUnsupportedOperationException, - "TensorFlowLite currently supports float (32 bits), " - "int (32 bits), byte (8 bits), and long (64 bits), " - "support for other types (DataType %d in this case) will " - "be added in the future", + "DataType error: TensorFlowLite currently supports float " + "(32 bits), int (32 bits), byte (8 bits), and long " + "(64 bits), support for other types (DataType %d in this " + "case) will be added in the future", kTfLiteFloat32, type); return 0; } @@ -88,8 +89,9 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type, if (size > src_size) { throwException( env, kIllegalStateException, - "cannot fill a Java array of %d bytes with a Tensor of %d bytes", size, - src_size); + "Internal error: cannot fill a Java array of %d bytes with a Tensor of " + "%d bytes", + size, src_size); return 0; } switch (data_type) { @@ -117,8 +119,8 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type, return size; } default: { - throwException(env, kIllegalStateException, "invalid DataType(%d)", - data_type); + throwException(env, kIllegalStateException, + "DataType error: invalid DataType(%d)", data_type); } } return 0; @@ -152,19 +154,22 @@ size_t elementByteSize(TfLiteType data_type) { switch (data_type) { case kTfLiteFloat32: static_assert(sizeof(jfloat) == 4, - "Java float not compatible with kTfLiteFloat"); + "Interal error: Java float not compatible with " + "kTfLiteFloat"); return 4; case kTfLiteInt32: static_assert(sizeof(jint) == 4, - "Java int not compatible with kTfLiteInt"); + "Interal error: Java int not compatible with kTfLiteInt"); return 4; case kTfLiteUInt8: static_assert(sizeof(jbyte) == 1, - "Java byte not compatible with kTfLiteUInt8"); + "Interal error: Java byte not compatible with " + "kTfLiteUInt8"); return 1; case kTfLiteInt64: static_assert(sizeof(jlong) == 8, - "Java long not compatible with kTfLiteInt64"); + "Interal error: Java long not compatible with " + "kTfLiteInt64"); return 8; default: return 0; @@ -212,7 +217,7 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env, int num_dims = tensor->dims->size; if (num_dims == 0) { throwException(env, kIllegalArgumentException, - "copyTo() is not meant for scalar Tensors."); + "Internal error: Cannot copy empty/scalar Tensors."); return; } readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes, diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java index dbe45e5a05b8227b441de7ca6747f61d010ae210..7c00d3196fd001a288d77d4e01f0b30978d72afe 100644 --- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java +++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java @@ -321,9 +321,7 @@ public final class NativeInterpreterWrapperTest { wrapper.run(inputs); fail(); } catch (IllegalArgumentException e) { - assertThat(e) - .hasMessageThat() - .contains("Invalid inputs. Inputs should not be null or empty."); + assertThat(e).hasMessageThat().contains("Inputs should not be null or empty."); } wrapper.close(); } @@ -440,7 +438,7 @@ public final class NativeInterpreterWrapperTest { NativeInterpreterWrapper.numDimensions(emptyArray); fail(); } catch (IllegalArgumentException e) { - assertThat(e).hasMessageThat().contains("array lengths cannot be 0."); + assertThat(e).hasMessageThat().contains("Array lengths cannot be 0."); } } diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD index 8cfa7e53d1d15b1dd71e7515b717a45173ffd08b..80cefe83b29192d3ed8f37fc38c4b6d8a9a039a9 100644 --- a/tensorflow/contrib/lite/kernels/BUILD +++ b/tensorflow/contrib/lite/kernels/BUILD @@ -212,6 +212,7 @@ tf_cc_test( name = "audio_spectrogram_test", size = "small", srcs = ["audio_spectrogram_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -225,6 +226,7 @@ tf_cc_test( name = "mfcc_test", size = "small", srcs = ["mfcc_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -346,6 +348,7 @@ tf_cc_test( name = "cast_test", size = "small", srcs = ["cast_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -398,6 +401,7 @@ tf_cc_test( name = "dequantize_test", size = "small", srcs = ["dequantize_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", @@ -504,6 +508,7 @@ tf_cc_test( name = "maximum_minimum_test", size = "small", srcs = ["maximum_minimum_test.cc"], + tags = ["tflite_not_portable_ios"], deps = [ ":builtin_ops", "//tensorflow/contrib/lite:framework", diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h index 9ac48ebdc99abae910131fa17a0c5369a024c464..9e9aba0169bcd4ff3453557cb16737888c899be3 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h @@ -1203,13 +1203,330 @@ void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, output_activation_max, output_data, output_dims, gemm_context); } +// Internal function doing the actual arithmetic work for +// ExperimentalShuffledFullyConnected. +// May be called either directly by it (single-threaded case) or may be used +// as the 'task' for worker threads to run (multi-threaded case, see +// ExperimentalShuffledFullyConnectedWorkerTask below). +inline void ExperimentalShuffledFullyConnectedWorkerImpl( + const uint8* shuffled_input_workspace_data, + const int8* shuffled_weights_data, int batches, int output_depth, + int output_stride, int accum_depth, const int32* bias_data, + int32 output_multiplier, int output_shift, int16* output_data) { +#if defined USE_NEON + const int8* shuffled_weights_ptr = shuffled_weights_data; + if (batches == 1) { + const int right_shift = output_shift > 0 ? output_shift : 0; + const int left_shift = output_shift > 0 ? 0 : -output_shift; + for (int c = 0; c < output_depth; c += 4) { + // Accumulation loop. + int32x4_t row_accum0 = vdupq_n_s32(0); + int32x4_t row_accum1 = vdupq_n_s32(0); + int32x4_t row_accum2 = vdupq_n_s32(0); + int32x4_t row_accum3 = vdupq_n_s32(0); + for (int d = 0; d < accum_depth; d += 16) { + int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); + int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); + int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); + int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); + shuffled_weights_ptr += 64; + int8x16_t input = + vreinterpretq_s8_u8(vld1q_u8(shuffled_input_workspace_data + d)); + int16x8_t local_accum0 = + vmull_s8(vget_low_s8(weights0), vget_low_s8(input)); + int16x8_t local_accum1 = + vmull_s8(vget_low_s8(weights1), vget_low_s8(input)); + int16x8_t local_accum2 = + vmull_s8(vget_low_s8(weights2), vget_low_s8(input)); + int16x8_t local_accum3 = + vmull_s8(vget_low_s8(weights3), vget_low_s8(input)); + local_accum0 = + vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input)); + local_accum1 = + vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input)); + local_accum2 = + vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input)); + local_accum3 = + vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input)); + row_accum0 = vpadalq_s16(row_accum0, local_accum0); + row_accum1 = vpadalq_s16(row_accum1, local_accum1); + row_accum2 = vpadalq_s16(row_accum2, local_accum2); + row_accum3 = vpadalq_s16(row_accum3, local_accum3); + } + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, + pairwise_reduced_acc_2, pairwise_reduced_acc_3; + pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0)); + pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1)); + pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2)); + pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_data + c); + reduced = vaddq_s32(reduced, bias_vec); + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, right_shift); + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(reduced); + vst1_s16(output_data + c, res16); + } + } else if (batches == 4) { + const int right_shift = output_shift > 0 ? output_shift : 0; + const int left_shift = output_shift > 0 ? 0 : -output_shift; + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = + reinterpret_cast(shuffled_input_workspace_data); + // Accumulation loop. + int32x4_t row_accum00 = vdupq_n_s32(0); + int32x4_t row_accum10 = vdupq_n_s32(0); + int32x4_t row_accum20 = vdupq_n_s32(0); + int32x4_t row_accum30 = vdupq_n_s32(0); + int32x4_t row_accum01 = vdupq_n_s32(0); + int32x4_t row_accum11 = vdupq_n_s32(0); + int32x4_t row_accum21 = vdupq_n_s32(0); + int32x4_t row_accum31 = vdupq_n_s32(0); + int32x4_t row_accum02 = vdupq_n_s32(0); + int32x4_t row_accum12 = vdupq_n_s32(0); + int32x4_t row_accum22 = vdupq_n_s32(0); + int32x4_t row_accum32 = vdupq_n_s32(0); + int32x4_t row_accum03 = vdupq_n_s32(0); + int32x4_t row_accum13 = vdupq_n_s32(0); + int32x4_t row_accum23 = vdupq_n_s32(0); + int32x4_t row_accum33 = vdupq_n_s32(0); + for (int d = 0; d < accum_depth; d += 16) { + int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); + int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); + int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); + int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); + shuffled_weights_ptr += 64; + int8x16_t input0 = vld1q_s8(shuffled_input_ptr + 0); + int8x16_t input1 = vld1q_s8(shuffled_input_ptr + 16); + int8x16_t input2 = vld1q_s8(shuffled_input_ptr + 32); + int8x16_t input3 = vld1q_s8(shuffled_input_ptr + 48); + shuffled_input_ptr += 64; + int16x8_t local_accum0, local_accum1, local_accum2, local_accum3; +#define TFLITE_SHUFFLED_FC_ACCUM(B) \ + local_accum0 = vmull_s8(vget_low_s8(weights0), vget_low_s8(input##B)); \ + local_accum1 = vmull_s8(vget_low_s8(weights1), vget_low_s8(input##B)); \ + local_accum2 = vmull_s8(vget_low_s8(weights2), vget_low_s8(input##B)); \ + local_accum3 = vmull_s8(vget_low_s8(weights3), vget_low_s8(input##B)); \ + local_accum0 = \ + vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input##B)); \ + local_accum1 = \ + vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input##B)); \ + local_accum2 = \ + vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input##B)); \ + local_accum3 = \ + vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input##B)); \ + row_accum0##B = vpadalq_s16(row_accum0##B, local_accum0); \ + row_accum1##B = vpadalq_s16(row_accum1##B, local_accum1); \ + row_accum2##B = vpadalq_s16(row_accum2##B, local_accum2); \ + row_accum3##B = vpadalq_s16(row_accum3##B, local_accum3); + + TFLITE_SHUFFLED_FC_ACCUM(0) + TFLITE_SHUFFLED_FC_ACCUM(1) + TFLITE_SHUFFLED_FC_ACCUM(2) + TFLITE_SHUFFLED_FC_ACCUM(3) + +#undef TFLITE_SHUFFLED_FC_ACCUM + } + // Horizontally reduce accumulators + +#define TFLITE_SHUFFLED_FC_STORE(B) \ + { \ + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, \ + pairwise_reduced_acc_2, pairwise_reduced_acc_3; \ + pairwise_reduced_acc_0 = \ + vpadd_s32(vget_low_s32(row_accum0##B), vget_high_s32(row_accum0##B)); \ + pairwise_reduced_acc_1 = \ + vpadd_s32(vget_low_s32(row_accum1##B), vget_high_s32(row_accum1##B)); \ + pairwise_reduced_acc_2 = \ + vpadd_s32(vget_low_s32(row_accum2##B), vget_high_s32(row_accum2##B)); \ + pairwise_reduced_acc_3 = \ + vpadd_s32(vget_low_s32(row_accum3##B), vget_high_s32(row_accum3##B)); \ + const int32x2_t reduced_lo = \ + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); \ + const int32x2_t reduced_hi = \ + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); \ + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); \ + int32x4_t bias_vec = vld1q_s32(bias_data + c); \ + reduced = vaddq_s32(reduced, bias_vec); \ + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); \ + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); \ + using gemmlowp::RoundingDivideByPOT; \ + reduced = RoundingDivideByPOT(reduced, right_shift); \ + const int16x4_t res16 = vqmovn_s32(reduced); \ + vst1_s16(output_data + c + B * output_stride, res16); \ + } + + TFLITE_SHUFFLED_FC_STORE(0); + TFLITE_SHUFFLED_FC_STORE(1); + TFLITE_SHUFFLED_FC_STORE(2); + TFLITE_SHUFFLED_FC_STORE(3); + +#undef TFLITE_SHUFFLED_FC_STORE + } + } else { + TFLITE_DCHECK(false); + return; + } +#else + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4] = {0}; + // Accumulation loop. + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_data[d + j]; + int8 weights_val = *shuffled_weights_ptr++; + accum[i] += weights_val * input_val; + } + } + } + for (int i = 0; i < 4; i++) { + // Add bias value + int acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); + output_ptr[c + i] = acc; + } + } + } else if (batches == 4) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = shuffled_input_data; + // Accumulation loop. + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4][4]; + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + accum[i][b] = 0; + } + } + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_ptr[16 * b + j]; + int8 weights_val = shuffled_weights_ptr[16 * i + j]; + accum[i][b] += weights_val * input_val; + } + } + } + shuffled_input_ptr += 64; + shuffled_weights_ptr += 64; + } + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + // Add bias value + int acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The + // quantized multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); + output_ptr[b * output_stride + c + i] = acc; + } + } + } + } else { + TFLITE_DCHECK(false); + return; + } +#endif +} + +// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class +// to allow using gemmlowp's threadpool. +struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { + ExperimentalShuffledFullyConnectedWorkerTask( + const uint8* input_data, const int8* shuffled_weights_data, int batches, + int output_depth, int output_stride, int accum_depth, + const int32* bias_data, int32 output_multiplier, int output_shift, + int16* output_data) + : input_data_(input_data), + shuffled_weights_data_(shuffled_weights_data), + batches_(batches), + output_depth_(output_depth), + output_stride_(output_stride), + accum_depth_(accum_depth), + bias_data_(bias_data), + output_multiplier_(output_multiplier), + output_shift_(output_shift), + output_data_(output_data) {} + + void Run() override { + ExperimentalShuffledFullyConnectedWorkerImpl( + input_data_, shuffled_weights_data_, batches_, output_depth_, + output_stride_, accum_depth_, bias_data_, output_multiplier_, + output_shift_, output_data_); + } + + const uint8* input_data_; + const int8* shuffled_weights_data_; + int batches_; + int output_depth_; + int output_stride_; + int accum_depth_; + const int32* bias_data_; + int32 output_multiplier_; + int output_shift_; + int16* output_data_; +}; + inline void ExperimentalShuffledFullyConnected( const uint8* input_data, const Dims<4>& input_dims, const uint8* shuffled_weights_data, const Dims<4>& weights_dims, const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { gemmlowp::ScopedProfilingLabel label( "ExperimentalShuffledFullyConnected/8bit"); (void)gemm_context; // only used in optimized code. @@ -1226,117 +1543,100 @@ inline void ExperimentalShuffledFullyConnected( const int accum_depth = ArraySize(weights_dims, 0); TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) // so that just reinterpreting them as int8 values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. - const int8* shuffled_weights_ptr = + const int8* int8_shuffled_weights_data = reinterpret_cast(shuffled_weights_data); -#if defined USE_NEON - // We'll only need to xor signbit to the input activation values, as - // that xor-ing is pre-built into the shuffled weights values. - const uint8x16_t signbit = vdupq_n_u8(0x80); - const int right_shift = output_shift > 0 ? output_shift : 0; - const int left_shift = output_shift > 0 ? 0 : -output_shift; - for (int c = 0; c < output_depth; c += 4) { - // Accumulation loop. - int32x4_t row_accum0 = vdupq_n_s32(0); - int32x4_t row_accum1 = vdupq_n_s32(0); - int32x4_t row_accum2 = vdupq_n_s32(0); - int32x4_t row_accum3 = vdupq_n_s32(0); - for (int d = 0; d < accum_depth; d += 16) { - int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); - int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); - int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); - int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); - shuffled_weights_ptr += 64; - int8x16_t input = - vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + d))); - int16x8_t local_accum0 = - vmull_s8(vget_low_s8(weights0), vget_low_s8(input)); - int16x8_t local_accum1 = - vmull_s8(vget_low_s8(weights1), vget_low_s8(input)); - int16x8_t local_accum2 = - vmull_s8(vget_low_s8(weights2), vget_low_s8(input)); - int16x8_t local_accum3 = - vmull_s8(vget_low_s8(weights3), vget_low_s8(input)); - local_accum0 = - vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input)); - local_accum1 = - vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input)); - local_accum2 = - vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input)); - local_accum3 = - vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input)); - row_accum0 = vpadalq_s16(row_accum0, local_accum0); - row_accum1 = vpadalq_s16(row_accum1, local_accum1); - row_accum2 = vpadalq_s16(row_accum2, local_accum2); - row_accum3 = vpadalq_s16(row_accum3, local_accum3); + + // Shuffling and xoring of input activations into the workspace buffer + if (batches == 1) { +#ifdef USE_NEON + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (int i = 0; i < accum_depth; i += 16) { + uint8x16_t val = vld1q_u8(input_data + i); + val = veorq_u8(val, signbit); + vst1q_u8(shuffled_input_workspace_data + i, val); } - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, - pairwise_reduced_acc_2, pairwise_reduced_acc_3; - pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0)); - pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1)); - pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2)); - pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. - int32x4_t bias_vec = vld1q_s32(bias_data + c); - reduced = vaddq_s32(reduced, bias_vec); - reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, right_shift); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - vst1_s16(output_data + c, res16); - } #else - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. - for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } +#endif + } else if (batches == 4) { + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + int c = 0; +#ifdef USE_NEON + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (c = 0; c < accum_depth; c += 16) { + const uint8* src_data_ptr = input_data + c; + uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth); + uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth); + uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth); + uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth); + val0 = veorq_u8(val0, signbit); + val1 = veorq_u8(val1, signbit); + val2 = veorq_u8(val2, signbit); + val3 = veorq_u8(val3, signbit); + vst1q_u8(shuffled_input_workspace_ptr + 0, val0); + vst1q_u8(shuffled_input_workspace_ptr + 16, val1); + vst1q_u8(shuffled_input_workspace_ptr + 32, val2); + vst1q_u8(shuffled_input_workspace_ptr + 48, val3); + shuffled_input_workspace_ptr += 64; + } +#else + for (c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - int8 input_val = input_data[d + j] - 128; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; } } } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = - MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[c + i] = acc; - } - } #endif + } else { + TFLITE_DCHECK(false); + return; + } + + static constexpr int kKernelRows = 4; + const int thread_count = gemmlowp::HowManyThreads( + gemm_context->max_num_threads(), output_depth, batches, accum_depth); + if (thread_count == 1) { + // Single-thread case: do the computation on the current thread, don't + // use a threadpool + ExperimentalShuffledFullyConnectedWorkerImpl( + shuffled_input_workspace_data, int8_shuffled_weights_data, batches, + output_depth, output_depth, accum_depth, bias_data, output_multiplier, + output_shift, output_data); + return; + } + + // Multi-threaded case: use the gemmlowp context's threadpool. + TFLITE_DCHECK_GT(thread_count, 1); + std::vector tasks(thread_count); + const int kRowsPerWorker = + gemmlowp::RoundUp(output_depth / thread_count); + int row_start = 0; + for (int i = 0; i < thread_count; i++) { + int row_end = std::min(output_depth, row_start + kRowsPerWorker); + tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask( + shuffled_input_workspace_data, + int8_shuffled_weights_data + row_start * accum_depth, batches, + row_end - row_start, output_depth, accum_depth, bias_data + row_start, + output_multiplier, output_shift, output_data + row_start); + row_start = row_end; + } + TFLITE_DCHECK_EQ(row_start, output_depth); + gemm_context->workers_pool()->Execute(tasks); } template @@ -5474,6 +5774,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims, const std::vector& right_paddings, T* output_data, const Dims<4>& output_dims, const int32_t pad_value) { gemmlowp::ScopedProfilingLabel label("Pad"); + TFLITE_DCHECK_EQ(left_paddings.size(), 4); + TFLITE_DCHECK_EQ(right_paddings.size(), 4); + const int output_batch = ArraySize(output_dims, 3); const int output_height = ArraySize(output_dims, 2); const int output_width = ArraySize(output_dims, 1); @@ -5561,43 +5864,127 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims, output_dims, 0); } +// UNOPTIMIZED COPY of StridedSlice from reference_ops.h (see comments there). + +// Use until std::clamp() is available from C++17. +inline int Clamp(const int v, const int lo, const int hi) { + TFLITE_DCHECK(!(hi < lo)); + if (hi < v) return hi; + if (v < lo) return lo; + return v; +} + +inline int StartForAxis(int begin_mask, const std::vector& start_indices, + const std::vector& strides, + const Dims<4>& input_shape, int axis) { + // Begin with the specified index + int start = start_indices[axis]; + + // begin_mask override + if (begin_mask & 1 << axis) { + if (strides[axis] > 0) { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits::lowest(); + } else { + // Backward iteration - use the last element. + start = std::numeric_limits::max(); + } + } + + // Handle negative indices + int axis_size = input_shape.sizes[axis]; + if (start < 0) { + start += axis_size; + } + + // Clamping + start = Clamp(start, 0, axis_size - 1); + + return start; +} + +inline int StopForAxis(int end_mask, const std::vector& stop_indices, + const std::vector& strides, + const Dims<4>& input_shape, int axis) { + // Begin with the specified index + int stop = stop_indices[axis]; + + // end_mask override + if (end_mask & (1 << axis)) { + if (strides[axis] > 0) { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits::max(); + } else { + // Backward iteration - use the first element. + stop = std::numeric_limits::lowest(); + } + } + + // Handle negative indices + int axis_size = input_shape.sizes[axis]; + if (stop < 0) { + stop += axis_size; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (strides[axis] > 0) { + // Forward iteration + stop = Clamp(stop, 0, axis_size); + } else { + // Backward iteration + stop = Clamp(stop, -1, axis_size - 1); + } + + return stop; +} + +inline bool LoopCondition(int index, int stop, int stride) { + // True when we have reached the end of an axis and should loop. + return stride > 0 ? index >= stop : index <= stop; +} + template inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, int begin_mask, int end_mask, - const std::vector& starts, - const std::vector& stops, + const std::vector& start_indices, + const std::vector& stop_indices, const std::vector& strides, T* output_data, const Dims<4>& output_dims) { - gemmlowp::ScopedProfilingLabel label("StridedSlice"); - const int start_b = (begin_mask & 8) ? 0 : starts[3]; - const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3]; - const int start_h = (begin_mask & 4) ? 0 : starts[2]; - const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2]; - const int start_w = (begin_mask & 2) ? 0 : starts[1]; - const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1]; - const int start_d = (begin_mask & 1) ? 0 : starts[0]; - const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0]; + TFLITE_DCHECK_EQ(start_indices.size(), 4); + TFLITE_DCHECK_EQ(stop_indices.size(), 4); + TFLITE_DCHECK_EQ(strides.size(), 4); + const int start_b = + StartForAxis(begin_mask, start_indices, strides, input_dims, 3); + const int stop_b = + StopForAxis(end_mask, stop_indices, strides, input_dims, 3); + const int start_h = + StartForAxis(begin_mask, start_indices, strides, input_dims, 2); + const int stop_h = + StopForAxis(end_mask, stop_indices, strides, input_dims, 2); + const int start_w = + StartForAxis(begin_mask, start_indices, strides, input_dims, 1); + const int stop_w = + StopForAxis(end_mask, stop_indices, strides, input_dims, 1); + const int start_d = + StartForAxis(begin_mask, start_indices, strides, input_dims, 0); + const int stop_d = + StopForAxis(end_mask, stop_indices, strides, input_dims, 0); T* out_ptr = output_data; - if (strides[0] == 0) { - for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) { - for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) { - for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) { - const int len = stop_d - start_d; - memcpy(out_ptr, - input_data + Offset(input_dims, start_d, in_w, in_h, in_b), - len * sizeof(T)); - out_ptr += len; - } - } - } - } else { - for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) { - for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) { - for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) { - for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) { - *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; - } + for (int in_b = start_b; !LoopCondition(in_b, stop_b, strides[3]); + in_b += strides[3]) { + for (int in_h = start_h; !LoopCondition(in_h, stop_h, strides[2]); + in_h += strides[2]) { + for (int in_w = start_w; !LoopCondition(in_w, stop_w, strides[1]); + in_w += strides[1]) { + for (int in_d = start_d; !LoopCondition(in_d, stop_d, strides[0]); + in_d += strides[0]) { + *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; } } } diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h index 4e324a5e107cf5a90c0042331899edab831c8e51..ff15f3e3b10324f8fa0ebcd924560c61221e7ee5 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ -#define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. @@ -135,4 +135,4 @@ void NeonReductionSumVector(const float* input_vector, float* output_vector, } // namespace tensor_utils } // namespace tflite -#endif // TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index b11489e4af0d4658ee2c44da55e379deee1fbb6e..4c8cbe42759d0af6bacca718d7f6bcae5efa1169 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -608,8 +608,9 @@ inline void ExperimentalShuffledFullyConnected( const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { (void)gemm_context; // only used in optimized code. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); // TODO(benoitjacob): This really should be: // const int batches = ArraySize(output_dims, 1); @@ -622,44 +623,130 @@ inline void ExperimentalShuffledFullyConnected( const int accum_depth = ArraySize(weights_dims, 0); TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); - // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to - // subtracting 128 from them, thus implementing for free the subtraction of - // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. - for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); + + // Shuffling and xoring of input activations into the workspace buffer + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + if (batches == 1) { + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } + } else if (batches == 4) { + for (int c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - int8 input_val = input_data[d + j] - 128; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; } } } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = - MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[c + i] = acc; + } else { + TFLITE_DCHECK(false); + return; + } + + // Actual computation + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4] = {0}; + // Accumulation loop. + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_data[d + j]; + int8 weights_val = *shuffled_weights_ptr++; + accum[i] += weights_val * input_val; + } + } + } + for (int i = 0; i < 4; i++) { + // Add bias value + int acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[c + i] = acc; + } } + } else if (batches == 4) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = shuffled_input_data; + // Accumulation loop. + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4][4]; + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + accum[i][b] = 0; + } + } + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_ptr[16 * b + j]; + int8 weights_val = shuffled_weights_ptr[16 * i + j]; + accum[i][b] += weights_val * input_val; + } + } + } + shuffled_input_ptr += 64; + shuffled_weights_ptr += 64; + } + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + // Add bias value + int acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The + // quantized multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[b * output_depth + c + i] = acc; + } + } + } + } else { + TFLITE_DCHECK(false); + return; } } @@ -2993,6 +3080,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims, const std::vector& left_paddings, const std::vector& right_paddings, T* output_data, const Dims<4>& output_dims, const int32_t pad_value) { + TFLITE_DCHECK_EQ(left_paddings.size(), 4); + TFLITE_DCHECK_EQ(right_paddings.size(), 4); + const int output_batch = ArraySize(output_dims, 3); const int output_height = ArraySize(output_dims, 2); const int output_width = ArraySize(output_dims, 1); @@ -3041,59 +3131,139 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims, output_dims, 0); } -inline bool LoopCondition(int index, int stop, int stride) { - return stride > 0 ? index < stop : index > stop; +// STRIDED SLICE +// The functions below for StridedSlice are mirrored in a number of places: +// +// propagate_fixed_sizes.cc +// propagate_shapes.cc +// resolve_constant_strided_slice.cc +// optimized_ops.h +// +// It is designed for an arbitrary number of dimensions, even though dimensions +// here are fixed at 4. This is because we expect to eventually support +// arbitrary dimensionality. Also note that the axis orders are reversed for +// runtime ops, and so the indices and masks must be as well too. +// +// Be warned this code involves some rather subtle logic of python slicing. The +// best "ground truth" is to compare results to actual python execution. + +// Use until std::clamp() is available from C++17. +inline int Clamp(const int v, const int lo, const int hi) { + TFLITE_DCHECK(!(hi < lo)); + if (hi < v) return hi; + if (v < lo) return lo; + return v; +} + +inline int StartForAxis(int begin_mask, const std::vector& start_indices, + const std::vector& strides, + const Dims<4>& input_shape, int axis) { + // Begin with the specified index + int start = start_indices[axis]; + + // begin_mask override + if (begin_mask & 1 << axis) { + if (strides[axis] > 0) { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits::lowest(); + } else { + // Backward iteration - use the last element. + start = std::numeric_limits::max(); + } + } + + // Handle negative indices + int axis_size = input_shape.sizes[axis]; + if (start < 0) { + start += axis_size; + } + + // Clamping + start = Clamp(start, 0, axis_size - 1); + + return start; } -inline int StartIndex(int start, int stride, int dim, bool masked) { - return masked ? (stride > 0 ? 0 : dim - 1) : start; +inline int StopForAxis(int end_mask, const std::vector& stop_indices, + const std::vector& strides, + const Dims<4>& input_shape, int axis) { + // Begin with the specified index + int stop = stop_indices[axis]; + + // end_mask override + if (end_mask & (1 << axis)) { + if (strides[axis] > 0) { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits::max(); + } else { + // Backward iteration - use the first element. + stop = std::numeric_limits::lowest(); + } + } + + // Handle negative indices + int axis_size = input_shape.sizes[axis]; + if (stop < 0) { + stop += axis_size; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (strides[axis] > 0) { + // Forward iteration + stop = Clamp(stop, 0, axis_size); + } else { + // Backward iteration + stop = Clamp(stop, -1, axis_size - 1); + } + + return stop; } -inline int StopIndex(int start, int stop, int stride, int dim, bool masked, - bool shrink_axis_masked) { - return shrink_axis_masked ? stride > 0 ? start + 1 : start - 1 - : masked ? (stride > 0 ? dim : -1) : stop; +inline bool LoopCondition(int index, int stop, int stride) { + // True when we have reached the end of an axis and should loop. + return stride > 0 ? index >= stop : index <= stop; } template inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, - int begin_mask, int end_mask, int shrink_axis_mask, - const std::vector& starts, - const std::vector& stops, + int begin_mask, int end_mask, + const std::vector& start_indices, + const std::vector& stop_indices, const std::vector& strides, T* output_data, const Dims<4>& output_dims) { - TFLITE_DCHECK_EQ(starts.size(), 4); - TFLITE_DCHECK_EQ(stops.size(), 4); + TFLITE_DCHECK_EQ(start_indices.size(), 4); + TFLITE_DCHECK_EQ(stop_indices.size(), 4); TFLITE_DCHECK_EQ(strides.size(), 4); const int start_b = - StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8); + StartForAxis(begin_mask, start_indices, strides, input_dims, 3); const int stop_b = - StopIndex(start_b, stops[3], strides[3], input_dims.sizes[3], - end_mask & 8, shrink_axis_mask & 8); + StopForAxis(end_mask, stop_indices, strides, input_dims, 3); const int start_h = - StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4); + StartForAxis(begin_mask, start_indices, strides, input_dims, 2); const int stop_h = - StopIndex(start_h, stops[2], strides[2], input_dims.sizes[2], - end_mask & 4, shrink_axis_mask & 4); + StopForAxis(end_mask, stop_indices, strides, input_dims, 2); const int start_w = - StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2); + StartForAxis(begin_mask, start_indices, strides, input_dims, 1); const int stop_w = - StopIndex(start_w, stops[1], strides[1], input_dims.sizes[1], - end_mask & 2, shrink_axis_mask & 2); + StopForAxis(end_mask, stop_indices, strides, input_dims, 1); const int start_d = - StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1); + StartForAxis(begin_mask, start_indices, strides, input_dims, 0); const int stop_d = - StopIndex(start_d, stops[0], strides[0], input_dims.sizes[0], - end_mask & 1, shrink_axis_mask & 1); + StopForAxis(end_mask, stop_indices, strides, input_dims, 0); T* out_ptr = output_data; - for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]); + for (int in_b = start_b; !LoopCondition(in_b, stop_b, strides[3]); in_b += strides[3]) { - for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]); + for (int in_h = start_h; !LoopCondition(in_h, stop_h, strides[2]); in_h += strides[2]) { - for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]); + for (int in_w = start_w; !LoopCondition(in_w, stop_w, strides[1]); in_w += strides[1]) { - for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]); + for (int in_d = start_d; !LoopCondition(in_d, stop_d, strides[0]); in_d += strides[0]) { *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; } @@ -3102,18 +3272,6 @@ inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, } } -template -inline void StridedSlice(const T* input_data, const Dims<4>& input_dims, - int begin_mask, int end_mask, - const std::vector& starts, - const std::vector& stops, - const std::vector& strides, T* output_data, - const Dims<4>& output_dims) { - StridedSlice(input_data, input_dims, begin_mask, end_mask, - /*shrink_axis_mask=*/0, starts, stops, strides, output_data, - output_dims); -} - template inline void Slice(const T* input_data, const Dims<4>& input_dims, const std::vector& begin, const std::vector& size, diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc index e6d5c300dcd47821b0572e3239b36f14bd6ea3d0..40ac436b7dcabe7a12166e5381f0381941a204d3 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice.cc @@ -87,6 +87,8 @@ inline int32_t ClampedIndex(int32_t index, int dim, bool pos_stride) { std::min(std::max(index, -dim), dim - 1), dim)); } +// TODO(b/77971377) this logic should be removed, as it's a duplication of +// StartForAxis() & StopForAxis() in kernels/internal/reference/reference_ops.h inline int32_t GetBeginValueAtIndex(StridedSliceContext* op_context, int idx) { const int dim = op_context->input->dims->data[idx]; const bool pos_stride = GetTensorData(op_context->strides)[idx] > 0; @@ -188,8 +190,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { std::vector strides; for (int idx = op_context.dims - 1; idx >= 0; --idx) { - starts.emplace_back(GetBeginValueAtIndex(&op_context, idx)); - stops.emplace_back(GetEndValueAtIndex(&op_context, idx)); + starts.emplace_back(GetTensorData(op_context.begin)[idx]); + stops.emplace_back(GetTensorData(op_context.end)[idx]); strides.emplace_back(GetTensorData(op_context.strides)[idx]); } @@ -202,15 +204,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { int begin_mask = ReverseMaskBits(op_context.params->begin_mask, op_context.dims); int end_mask = ReverseMaskBits(op_context.params->end_mask, op_context.dims); - int shrink_axis_mask = - ReverseMaskBits(op_context.params->shrink_axis_mask, op_context.dims); - -#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ - kernel_type::StridedSlice( \ - GetTensorData(op_context.input), \ - GetTensorDims(op_context.input), begin_mask, end_mask, shrink_axis_mask, \ - starts, stops, strides, GetTensorData(op_context.output), \ - GetTensorDims(op_context.output)) + +#define TF_LITE_STRIDED_SLICE(kernel_type, data_type) \ + kernel_type::StridedSlice(GetTensorData(op_context.input), \ + GetTensorDims(op_context.input), begin_mask, \ + end_mask, starts, stops, strides, \ + GetTensorData(op_context.output), \ + GetTensorDims(op_context.output)) switch (op_context.input->type) { case kTfLiteFloat32: diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc index 22d7b097cbd4e1349516eae9fd378aa186e06de7..cc39179bc705aa1083e74b06f8f7f3fb45e9f616 100644 --- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc +++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc @@ -377,29 +377,18 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) { StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4}); m.SetBegin({1}); - m.SetEnd({3}); + m.SetEnd({2}); m.SetStrides({1}); m.Invoke(); EXPECT_TRUE(m.GetOutputShape().empty()); EXPECT_THAT(m.GetOutput(), ElementsAreArray({2})); } -TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) { - StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1); - m.SetInput({1, 2, 3, 4}); - m.SetBegin({2}); - m.SetEnd({1}); - m.SetStrides({1}); - m.Invoke(); - EXPECT_TRUE(m.GetOutputShape().empty()); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); -} - TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) { StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4}); m.SetBegin({1}); - m.SetEnd({3}); + m.SetEnd({1}); m.SetStrides({1}); m.Invoke(); EXPECT_TRUE(m.GetOutputShape().empty()); @@ -421,7 +410,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) { StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4, 5, 6}); m.SetBegin({0, 0}); - m.SetEnd({2, 3}); + m.SetEnd({1, 3}); m.SetStrides({1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); @@ -432,7 +421,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) { StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2); m.SetInput({1, 2, 3, 4, 5, 6}); m.SetBegin({0, 0}); - m.SetEnd({2, 3}); + m.SetEnd({2, 1}); m.SetStrides({1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); @@ -443,7 +432,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) { StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3); m.SetInput({1, 2, 3, 4, 5, 6}); m.SetBegin({0, 0}); - m.SetEnd({2, 3}); + m.SetEnd({1, 1}); m.SetStrides({1, 1}); m.Invoke(); EXPECT_TRUE(m.GetOutputShape().empty()); @@ -454,7 +443,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({1, 3, 2}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2})); @@ -465,7 +454,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({2, 1, 2}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2})); @@ -476,7 +465,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({1, 1, 2}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); @@ -487,7 +476,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({2, 3, 1}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); @@ -498,7 +487,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({1, 3, 1}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); @@ -509,7 +498,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({2, 1, 1}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); @@ -520,7 +509,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) { StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({1, 1, 1}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_TRUE(m.GetOutputShape().empty()); @@ -553,7 +542,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) { 0, 0, 1); m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); m.SetBegin({0, 0, 0}); - m.SetEnd({2, 3, 2}); + m.SetEnd({1, 3, 2}); m.SetStrides({1, 1, 1}); m.Invoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2})); diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h index a9064d54e7704d52eefa34f6bf446ec1cfe68fe1..a5f345e98a9d4fda57072684298fbc084b8b9a61 100644 --- a/tensorflow/contrib/lite/kernels/test_util.h +++ b/tensorflow/contrib/lite/kernels/test_util.h @@ -88,7 +88,9 @@ struct TensorData { class SingleOpResolver : public OpResolver { public: SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration) - : op_(op), registration_(registration) {} + : op_(op), registration_(registration) { + registration_->builtin_code = op; + } TfLiteRegistration* FindOp(BuiltinOperator op) const override { if (op == op_) { return registration_; diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index 2dd6d67e078619df41524e8242a0475320c02013..f45f39d1e6f874ad548c3cd2b64c2c1ee1d2fa0d 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -194,7 +194,6 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() { builtin_code); status = kTfLiteError; } else if (builtin_code != BuiltinOperator_CUSTOM) { - flatbuffer_op_index_to_registration_types_.push_back(builtin_code); registration = op_resolver_.FindOp(builtin_code); if (registration == nullptr) { error_reporter_->Report("Didn't find op for builtin opcode '%s'\n", @@ -208,8 +207,6 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() { } else { const char* name = opcode->custom_code()->c_str(); registration = op_resolver_.FindOp(name); - flatbuffer_op_index_to_registration_types_.push_back( - BuiltinOperator_CUSTOM); if (registration == nullptr) { error_reporter_->Report("Didn't find custom op for name '%s'\n", name); status = kTfLiteError; @@ -702,8 +699,7 @@ TfLiteStatus InterpreterBuilder::ParseNodes( continue; } - auto op_type = - flatbuffer_op_index_to_registration_types_[op->opcode_index()]; + BuiltinOperator op_type = static_cast(reg->builtin_code); if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) { error_reporter_->Report( "Found builtin operator %s with custom options.\n", diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h index 5a55b031a8c28085e02782608eb820a3cfe78dde..a7d7f3ea1096799608d9fcd324de65638f1a6d6f 100644 --- a/tensorflow/contrib/lite/model.h +++ b/tensorflow/contrib/lite/model.h @@ -188,7 +188,6 @@ class InterpreterBuilder { ErrorReporter* error_reporter_; std::vector flatbuffer_op_index_to_registration_; - std::vector flatbuffer_op_index_to_registration_types_; const Allocation* allocation_ = nullptr; }; diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h index 85aca3687402a89b557d76ab5ace80dea8f8b23d..ace4827d8ce2150cde69a0b0c2d5ca39203193bd 100644 --- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h +++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h @@ -34,10 +34,13 @@ limitations under the License. inline void* loadLibrary(const char* name) { // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn // api RT - void* handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); + void* handle = nullptr; +#ifdef __ANDROID__ + handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); if (handle == nullptr) { NNAPI_LOG("nnapi error: unable to open library %s", name); } +#endif return handle; } diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc index e1366639c78a4e90740aaf42a9ba5770ec65cb78..dfdd80ea8a42af683632be1d7e8ab0062847077d 100644 --- a/tensorflow/contrib/lite/optional_debug_tools.cc +++ b/tensorflow/contrib/lite/optional_debug_tools.cc @@ -72,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) { // Prints a dump of what tensors and what nodes are in the interpreter. void PrintInterpreterState(Interpreter* interpreter) { - printf("Interpreter has %d tensors and %d nodes\n", + printf("Interpreter has %zu tensors and %zu nodes\n", interpreter->tensors_size(), interpreter->nodes_size()); printf("Inputs:"); PrintIntVector(interpreter->inputs()); diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc index 0c5f0cd31495d67beb867beb835cfc897d7d6265..b8784cca455cfc301f2cc30c9c6d031b7174f829 100644 --- a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc +++ b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc @@ -49,13 +49,13 @@ TEST(ProfileBufferTest, AddEvent) { auto event = GetProfileEvents(buffer)[0]; EXPECT_EQ(event->tag, "hello"); - EXPECT_GT(event->begin_timestamp_ms, 0); + EXPECT_GT(event->begin_timestamp_us, 0); EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT); EXPECT_EQ(event->event_metadata, 42); buffer.EndEvent(event_handle); EXPECT_EQ(1, buffer.Size()); - EXPECT_GE(event->end_timestamp_ms, event->begin_timestamp_ms); + EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us); } TEST(ProfileBufferTest, OverFlow) { diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc index 994523a8fb74b511a18f2ec53147ce81323bec1b..7914f36a319b855090ca43556967fa9abbc4e1b3 100644 --- a/tensorflow/contrib/lite/profiling/profiler_test.cc +++ b/tensorflow/contrib/lite/profiling/profiler_test.cc @@ -30,7 +30,7 @@ namespace { void AssertDurationOfEventAroundMs(const ProfileEvent* event, double expected_ms, double eps_ms) { double duration_ms = - (event->end_timestamp_ms - event->begin_timestamp_ms) / 1e3; + (event->end_timestamp_us - event->begin_timestamp_us) / 1e3; EXPECT_NEAR(expected_ms, duration_ms, eps_ms); } diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD index 926896d609d83aac3b875d33dfe3c4dc7ae89ccd..e6dcc7aa099ccde848ef6cd9322639ea2d0f1c01 100644 --- a/tensorflow/contrib/lite/python/BUILD +++ b/tensorflow/contrib/lite/python/BUILD @@ -39,16 +39,35 @@ py_test( py_library( name = "lite", srcs = ["lite.py"], - # data = [ - # "//tensorflow/contrib/lite/toco/python:toco_from_protos", - # ], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ + ":convert", + ":convert_saved_model", ":op_hint", + ], +) + +py_library( + name = "lite_constants", + srcs = ["lite_constants.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/contrib/lite/toco:toco_flags_proto_py", + ], +) + +py_library( + name = "convert", + srcs = ["convert.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":lite_constants", "//tensorflow/contrib/lite/toco:model_flags_proto_py", "//tensorflow/contrib/lite/toco:toco_flags_proto_py", "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco", + "//tensorflow/contrib/lite/toco/python:toco_from_protos", "//tensorflow/python:platform", ], ) @@ -66,15 +85,15 @@ py_library( ) py_test( - name = "lite_test", - srcs = ["lite_test.py"], + name = "convert_test", + srcs = ["convert_test.py"], srcs_version = "PY2AND3", tags = [ "no-internal-py3", "no_oss", ], deps = [ - ":lite", + ":convert", ":op_hint", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -84,13 +103,14 @@ py_test( ], ) -py_binary( +py_library( name = "convert_saved_model", srcs = ["convert_saved_model.py"], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - ":lite", + ":convert", + ":lite_constants", "//tensorflow/contrib/saved_model:saved_model_py", "//tensorflow/python:graph_util", "//tensorflow/python/tools:freeze_graph_lib", @@ -130,6 +150,15 @@ py_test( ], ) +py_binary( + name = "convert_saved_model_to_frozen_graph", + srcs = ["convert_saved_model_to_frozen_graph.py"], + srcs_version = "PY2AND3", + deps = [ + ":convert_saved_model", + ], +) + # Transitive dependencies of this target will be included in the pip package. py_library( name = "tf_lite_py_pip", diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..c4200c879ba0e17b3bd183f4004eb75ebdd2f5ee --- /dev/null +++ b/tensorflow/contrib/lite/python/convert.py @@ -0,0 +1,187 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Converts a frozen graph into a TFLite FlatBuffer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os as _os +import subprocess as _subprocess +import tempfile as _tempfile + +from tensorflow.contrib.lite.python import lite_constants +from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 +from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 +from tensorflow.python.framework import dtypes as _dtypes +from tensorflow.python.platform import resource_loader as _resource_loader +from tensorflow.python.util.lazy_loader import LazyLoader + + +# Lazy load since some of the performance benchmark skylark rules +# break dependencies. +_toco_python = LazyLoader( + "tensorflow_wrap_toco", globals(), + "tensorflow.contrib.lite.toco.python." + "tensorflow_wrap_toco") +del LazyLoader + +# Find the toco_from_protos binary using the resource loader if using from +# bazel, otherwise we are in a pip where console_scripts already has +# the toco_from_protos tool. +if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY: + _toco_from_proto_bin = "" +else: + _toco_from_proto_bin = _resource_loader.get_path_to_datafile( + "../toco/python/toco_from_protos") + +if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin): + _toco_from_proto_bin = "toco_from_protos" + + +def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str): + """Convert `input_data_str` according to model and toco parameters. + + Unless you know what you are doing consider using + the more friendly @{tf.contrib.lite.toco_convert}}. + + Args: + model_flags_str: Serialized proto describing model properties, see + `toco/model_flags.proto`. + toco_flags_str: Serialized proto describing conversion properties, see + `toco/toco_flags.proto`. + input_data_str: Input data in serialized form (e.g. a graphdef is common) + Returns: + Converted model in serialized form (e.g. a TFLITE model is common). + Raises: + RuntimeError: When conversion fails, an exception is raised with the error + message embedded. + """ + # TODO(aselle): When toco does not use fatal errors for failure, we can + # switch this on. + if not _toco_from_proto_bin: + return _toco_python.TocoConvert( + model_flags_str, toco_flags_str, input_data_str) + + with _tempfile.NamedTemporaryFile() as fp_toco, \ + _tempfile.NamedTemporaryFile() as fp_model, \ + _tempfile.NamedTemporaryFile() as fp_input, \ + _tempfile.NamedTemporaryFile() as fp_output: + fp_model.write(model_flags_str) + fp_toco.write(toco_flags_str) + fp_input.write(input_data_str) + fp_model.flush() + fp_toco.flush() + fp_input.flush() + + cmd = [ + _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name, + fp_output.name + ] + cmdline = " ".join(cmd) + proc = _subprocess.Popen( + cmdline, + shell=True, + stdout=_subprocess.PIPE, + stderr=_subprocess.STDOUT, + close_fds=True) + stdout, stderr = proc.communicate() + exitcode = proc.returncode + if exitcode == 0: + stuff = fp_output.read() + return stuff + else: + raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" % + (stdout, stderr)) + + +def tensor_name(x): + return x.name.split(":")[0] + + +def toco_convert(input_data, + input_tensors, + output_tensors, + inference_type=lite_constants.FLOAT, + input_format=lite_constants.TENSORFLOW_GRAPHDEF, + output_format=lite_constants.TFLITE, + quantized_input_stats=None, + drop_control_dependency=True): + """Convert a model using TOCO from `input_format` to `output_format`. + + Typically this is to convert from TensorFlow GraphDef to TFLite, in which + case the default `input_format` and `output_format` are sufficient. + + Args: + input_data: Input data (i.e. often `sess.graph_def`). + input_tensors: List of input tensors. Type and shape are computed using + `foo.get_shape()` and `foo.dtype`. + output_tensors: List of output tensors (only .name is used from this). + inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`. + input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF). + output_format: Type of data to write (currently must be TFLITE or + GRAPHVIZ_DOT) + quantized_input_stats: For each member of input_tensors the mean and + std deviation of training data. Only needed if `inference_type` is + `QUANTIZED_UINT8`. + drop_control_dependency: Drops control dependencies silently. This is due + to tf lite not supporting control dependencies. + + Returns: + The converted data. For example if tflite was the destination, then + this will be a tflite flatbuffer in a bytes array. + + Raises: + ValueError: If the input tensor type is unknown + RuntimeError: If TOCO fails to convert (in which case the runtime error's + error text will contain the TOCO error log) + """ + toco = _toco_flags_pb2.TocoFlags() + toco.input_format = input_format + toco.output_format = output_format + toco.drop_control_dependency = drop_control_dependency + model = _model_flags_pb2.ModelFlags() + toco.inference_type = inference_type + for idx, input_tensor in enumerate(input_tensors): + if input_tensor.dtype == _dtypes.float32: + tflite_input_type = lite_constants.FLOAT + elif input_tensor.dtype == _dtypes.int32: + tflite_input_type = lite_constants.INT32 + elif input_tensor.dtype == _dtypes.int64: + tflite_input_type = lite_constants.INT64 + # TODO(aselle): Insert strings when they are available + else: + raise ValueError("Tensors %s not known type %r" % (input_tensor.name, + input_tensor.dtype)) + + input_array = model.input_arrays.add() + + if inference_type == lite_constants.QUANTIZED_UINT8: + if tflite_input_type == lite_constants.FLOAT: + tflite_input_type = lite_constants.QUANTIZED_UINT8 + input_array.mean_value, input_array.std_value = quantized_input_stats[idx] + + input_array.name = tensor_name(input_tensor) + input_array.shape.dims.extend(map(int, input_tensor.get_shape())) + + for output_tensor in output_tensors: + model.output_arrays.append(tensor_name(output_tensor)) + + # TODO(aselle): Consider handling the case of allowing quantized + # inputs to be converted to float (via the toco.inference_input_type field). + data = toco_convert_protos(model.SerializeToString(), + toco.SerializeToString(), + input_data.SerializeToString()) + return data diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py index a2b5ef488ec1feb455b2c8d5d1c4005c3b2f60d6..a7eddf3408f54dff5fa49ff6fa7b61cd0b8a22e4 100644 --- a/tensorflow/contrib/lite/python/convert_saved_model.py +++ b/tensorflow/contrib/lite/python/convert_saved_model.py @@ -12,52 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -r"""TensorFlow Lite flatbuffer generation from saved_models. +"""Functions to convert SavedModel to frozen GraphDefs.""" -Example: - -bazel run third_party/tensorflow/contrib/lite/python:convert_saved_model -- \ - --saved_model_dir=/tmp/test_saved_model/1519865537 \ - --output_tflite=/tmp/test.lite - -""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.lite.python import lite +from tensorflow.contrib.lite.python import convert +from tensorflow.contrib.lite.python import lite_constants +from tensorflow.contrib.lite.toco import model_flags_pb2 from tensorflow.contrib.saved_model.python.saved_model import reader from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils from tensorflow.core.framework import types_pb2 from tensorflow.python.client import session from tensorflow.python.framework import graph_util as tf_graph_util from tensorflow.python.framework import ops -from tensorflow.python.platform import app -from tensorflow.python.platform import flags from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import loader from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants -flags.DEFINE_string("saved_model_dir", "", "Saved model directory to convert.") -flags.DEFINE_string("output_tflite", None, "File path to write flatbuffer.") -flags.DEFINE_string("output_arrays", None, - "List of output tensor names, the default value is None, " - "which means the conversion will keep all outputs.") -flags.DEFINE_integer("batch_size", 1, - "If input tensor shape has None at first dimension, " - "e.g. (None,224,224,3), replace None with batch_size.") -flags.DEFINE_string("tag_set", tag_constants.SERVING, - "Group of tag(s) of the MetaGraphDef in the saved_model, " - "in string format, separated by ','. For tag-set contains " - "multiple tags, all tags must be passed in.") -flags.DEFINE_string("signature_key", - signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, - "This is signature key to extract inputs, outputs.") - - -def log_tensor_details(tensor_info): + +def _write_and_flush_file(file_path, data_str): + """Writes data to file path. + + Args: + file_path: Full path of the file to store data in. + data_str: Data represented as a string. + + Returns: None. + """ + with gfile.Open(file_path, "wb") as data_file: + data_file.write(data_str) + data_file.flush() + + +def _log_tensor_details(tensor_info): """Log tensor details: name, shape, and type.""" for key in tensor_info: val = tensor_info[key] @@ -73,7 +64,7 @@ def log_tensor_details(tensor_info): dtype) -def get_meta_graph_def(saved_model_dir, tag_set): +def _get_meta_graph_def(saved_model_dir, tag_set): """Validate saved_model and extract MetaGraphDef. Args: @@ -103,7 +94,7 @@ def get_meta_graph_def(saved_model_dir, tag_set): "values are '{}'. ".format(tag_set, tag_sets)) -def get_signature_def(meta_graph, signature_key): +def _get_signature_def(meta_graph, signature_key): """Get the signature def from meta_graph with given signature_key. Args: @@ -130,11 +121,11 @@ def get_signature_def(meta_graph, signature_key): return signature_def -def get_inputs_outputs(signature_def): - """Get inputs and outputs from signature def. +def _get_inputs_outputs(signature_def): + """Get inputs and outputs from SignatureDef. Args: - signature_def: signatuer def in the meta_graph_def for conversion. + signature_def: SignatureDef in the meta_graph_def for conversion. Returns: The inputs and outputs in the graph for conversion. @@ -142,9 +133,9 @@ def get_inputs_outputs(signature_def): inputs_tensor_info = signature_def.inputs outputs_tensor_info = signature_def.outputs logging.info("input tensors info: ") - log_tensor_details(inputs_tensor_info) + _log_tensor_details(inputs_tensor_info) logging.info("output tensors info: ") - log_tensor_details(outputs_tensor_info) + _log_tensor_details(outputs_tensor_info) def gather_names(tensor_info): return [tensor_info[key].name for key in tensor_info] @@ -154,109 +145,277 @@ def get_inputs_outputs(signature_def): return inputs, outputs -def convert(saved_model_dir, - output_tflite=None, - output_arrays=None, - tag_set=None, - signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, - batch_size=1): - """Convert a saved_model to tflite flatbuffer. +def _get_tensors(graph, signature_def_tensor_names=None, + user_tensor_names=None): + """Gets the tensors associated with the tensor names. + + Either signature_def_tensor_names or user_tensor_names should be provided. If + the user provides tensors, the tensors associated with the user provided + tensor names are provided. Otherwise, the tensors associated with the names in + the SignatureDef are provided. Args: - saved_model_dir: Saved model directory to convert. - output_tflite: File path to write result flatbuffer. - output_arrays: List of output tensor names, the default value is None, which - means conversion keeps all output tensors. This is also used to filter - tensors that are from Op currently not supported in tflite, e.g., Argmax). - tag_set: This is the set of tags to get meta_graph_def in saved_model. - signature_key: This is the signature key to extract inputs, outputs. - batch_size: If input tensor shape has None at first dimension, - e.g. (None,224,224,3), replace None with batch_size. + graph: GraphDef representing graph. + signature_def_tensor_names: Tensor names stored in either the inputs or + outputs of a SignatureDef. (default None) + user_tensor_names: Tensor names provided by the user. (default None) Returns: - The converted data. For example if tflite was the destination, then - this will be a tflite flatbuffer in a bytes array. + List of tensors. + + Raises: + ValueError: + signature_def_tensors and user_tensor_names are undefined or empty. + user_tensor_names are not valid. + """ + tensors = [] + if user_tensor_names: + # Get the list of all of the tensors with and without the tensor index. + all_tensor_names = [ + tensor.name for op in graph.get_operations() for tensor in op.outputs + ] + all_tensor_names_only = [name.split(":")[0] for name in all_tensor_names] + + # Sort the tensor names. + user_tensor_names = sorted(user_tensor_names) + + # Get the tensors associated with the tensor names. + tensors = [] + invalid_tensors = [] + for name in user_tensor_names: + if name not in all_tensor_names_only: + invalid_tensors.append(name) + else: + idx = all_tensor_names_only.index(name) + tensors.append(graph.get_tensor_by_name(all_tensor_names[idx])) + + # Throw ValueError if any user input names are not valid tensors. + if invalid_tensors: + raise ValueError("Invalid tensors '{}' were found.".format( + ",".join(invalid_tensors))) + elif signature_def_tensor_names: + tensors = [ + graph.get_tensor_by_name(name) + for name in sorted(signature_def_tensor_names) + ] + else: + # Throw ValueError if signature_def_tensors and user_tensor_names are both + # either undefined or empty. + raise ValueError( + "Specify either signature_def_tensor_names or user_tensor_names") + + return tensors + + +def _freeze_saved_model(saved_model_dir, input_arrays, input_shapes, + output_arrays, tag_set, signature_key, batch_size): + """Converts a SavedModel to a frozen graph. + + Args: + saved_model_dir: SavedModel directory to convert. + input_arrays: List of input tensors to freeze graph with. Uses input arrays + from SignatureDef when none are provided. (default None) + input_shapes: Map of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : None}). + (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to + analyze. All tags in the tag set must be present. (default "serve") + signature_key: Key identifying SignatureDef containing inputs and outputs. + batch_size: Batch size for the model. Replaces the first dimension of an + input size array if undefined. (default 1) + + Returns: + frozen_graph_def: Frozen GraphDef. + in_tensors: List of input tensors for the graph. + out_tensors: List of output tensors for the graph. Raises: - ValueError: If tag_set does not indicate any meta_graph_def in saved_model, - or signature_key is not in relevant meta_graph_def, - or input shape has None beyond 1st dimension, e.g., (1,None, None, 3), - or given output_arrays are not valid causing empty outputs. + ValueError: + SavedModel doesn't contain a MetaGraphDef identified by tag_set. + signature_key is not in the MetaGraphDef. + input_shapes does not match the length of input_arrays. + input_shapes has a None value after the 1st dimension. + input_arrays or output_arrays are not valid. + Unable to load Session. """ + # Set default values for inputs if they are set to None. + if signature_key is None: + signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY if tag_set is None: tag_set = set([tag_constants.SERVING]) + if batch_size is None: + batch_size = 1 - meta_graph = get_meta_graph_def(saved_model_dir, tag_set) - signature_def = get_signature_def(meta_graph, signature_key) - inputs, outputs = get_inputs_outputs(signature_def) + # Read SignatureDef. + meta_graph = _get_meta_graph_def(saved_model_dir, tag_set) + signature_def = _get_signature_def(meta_graph, signature_key) + inputs, outputs = _get_inputs_outputs(signature_def) graph = ops.Graph() with session.Session(graph=graph) as sess: - + # TODO(nupurgarg): Throw ValueError if SavedModel has assets/ directory. loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir) - in_tensors = [graph.get_tensor_by_name(input_) for input_ in inputs] - - # Users can use output_arrays to filter output tensors for conversion. - # If output_arrays is None, we keep all output tensors. In future, we may - # use tflite supported Op list and check whether op is custom Op to - # automatically filter output arrays. - # TODO(zhixianyan): Use tflite supported Op list to filter outputs. - if output_arrays is not None: - output_arrays = output_arrays.split(",") - out_tensors = [ - graph.get_tensor_by_name(output) - for output in outputs - if output.split(":")[0] in output_arrays - ] - else: - out_tensors = [graph.get_tensor_by_name(output) for output in outputs] + # Gets input and output tensors. + # TODO(zhixianyan): Use TFLite supported Op list to filter outputs. + in_tensors = _get_tensors(graph, inputs, input_arrays) + out_tensors = _get_tensors(graph, outputs, output_arrays) - output_names = [node.split(":")[0] for node in outputs] + # Gets fully defined tensor shape. An input tensor with None in the first + # dimension, e.g. (None, 224, 224, 3), is replaced with the batch_size. + # Shapes with None after the first dimension result in a ValueError. + # TODO(zhixianyan): Add supports for input tensor with more None in shape. + for tensor in in_tensors: + if (input_shapes and tensor.name in input_shapes and + input_shapes[tensor.name] is not None): + shape = input_shapes[tensor.name] + else: + shape = tensor.get_shape().as_list() - if not out_tensors: - raise ValueError( - "No valid output tensors for '{}', possible values are '{}'".format( - output_arrays, output_names)) + if None in shape[1:]: + raise ValueError( + "None is only supported in the 1st dimension. Tensor '{0}' has " + "invalid shape '{1}'.".format(tensor.name, shape)) + elif shape[0] is None: + shape[0] = batch_size + tensor.set_shape(shape) + output_names = [node.split(":")[0] for node in outputs] frozen_graph_def = tf_graph_util.convert_variables_to_constants( sess, graph.as_graph_def(), output_names) - # Toco requires fully defined tensor shape, for input tensor with None in - # their shape, e.g., (None, 224, 224, 3), we need to replace first None with - # a given batch size. For shape with more None, e.g. (None, None, None, 3), - # still be able to replace and convert, but require further investigation. - # TODO(zhixianyan): Add supports for input tensor with more None in shape. - for i in range(len(in_tensors)): - shape = in_tensors[i].get_shape().as_list() - if shape[0] is None: - shape[0] = batch_size - if None in shape[1:]: - raise ValueError( - "Only support None shape at 1st dim as batch_size. But tensor " - "'{}' 's shape '{}' has None at other dimension. ".format( - inputs[i], shape)) - in_tensors[i].set_shape(shape) + return frozen_graph_def, in_tensors, out_tensors + raise ValueError("Unable to load Session.") - result = lite.toco_convert(frozen_graph_def, in_tensors, out_tensors) - if output_tflite is not None: - with gfile.Open(output_tflite, "wb") as f: - f.write(result) - logging.info("Successfully converted to: %s", output_tflite) +def saved_model_to_frozen_graphdef( + saved_model_dir, + output_file_model, + output_file_flags, + input_arrays=None, + input_shapes=None, + output_arrays=None, + tag_set=None, + signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, + batch_size=1): + """Converts a SavedModel to a frozen graph. Writes graph to tmp directory. - return result + Stores frozen graph and command line flags in the tmp directory. + Args: + saved_model_dir: SavedModel directory to convert. + output_file_model: Full file path to save frozen graph. + output_file_flags: Full file path to save ModelFlags. + input_arrays: List of input tensors to freeze graph with. Uses input arrays + from SignatureDef when none are provided. (default None) + input_shapes: Map of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : None}). + (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to + analyze. All tags in the tag set must be present. (default "serve") + signature_key: Key identifying SignatureDef containing inputs and outputs. + batch_size: Batch size for the model. Replaces the first dimension of an + input size array if undefined. (default 1) + + Returns: None. -def main(_): - convert( - saved_model_dir=flags.FLAGS.saved_model_dir, - output_tflite=flags.FLAGS.output_tflite, - output_arrays=flags.FLAGS.output_arrays, - batch_size=flags.FLAGS.batch_size, - tag_set=set(flags.FLAGS.tag_set.split(",")), - signature_key=flags.FLAGS.signature_key) + Raises: + ValueError: Unable to convert to frozen graph. + """ + frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model( + saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set, + signature_key, batch_size) + + # Initialize model flags. + model = model_flags_pb2.ModelFlags() + + for input_tensor in in_tensors: + input_array = model.input_arrays.add() + input_array.name = convert.tensor_name(input_tensor) + input_array.shape.dims.extend(map(int, input_tensor.get_shape())) + + for output_tensor in out_tensors: + model.output_arrays.append(convert.tensor_name(output_tensor)) + + # Write model and ModelFlags to file. ModelFlags contain input array and + # output array information that is parsed from the SignatureDef and used for + # analysis by TOCO. + _write_and_flush_file(output_file_model, frozen_graph_def.SerializeToString()) + _write_and_flush_file(output_file_flags, model.SerializeToString()) + + +def tflite_from_saved_model( + saved_model_dir, + output_file=None, + input_arrays=None, + input_shapes=None, + output_arrays=None, + tag_set=None, + signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, + batch_size=1, + inference_type=lite_constants.FLOAT, + input_format=lite_constants.TENSORFLOW_GRAPHDEF, + output_format=lite_constants.TFLITE, + quantized_input_stats=None, + drop_control_dependency=True): + """Converts a SavedModel to TFLite FlatBuffer. + Args: + saved_model_dir: SavedModel directory to convert. + output_file: File path to write result TFLite FlatBuffer. + input_arrays: List of input tensors to freeze graph with. Uses input arrays + from SignatureDef when none are provided. (default None) + input_shapes: Map of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : None}). + (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to + analyze. All tags in the tag set must be present. (default "serve") + signature_key: Key identifying SignatureDef containing inputs and outputs. + batch_size: Batch size for the model. Replaces the first dimension of an + input size array if undefined. (default 1) + inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`. + input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF). + output_format: Type of data to write (currently must be TFLITE or + GRAPHVIZ_DOT) + quantized_input_stats: For each member of input_tensors the mean and + std deviation of training data. Only needed if `inference_type` is + `QUANTIZED_UINT8`. + drop_control_dependency: Drops control dependencies silently. This is due + to tf lite not supporting control dependencies. -if __name__ == "__main__": - app.run(main) + Returns: + The converted data. For example if tflite was the destination, then + this will be a tflite flatbuffer in a bytes array. + + Raises: + ValueError: Unable to convert to frozen graph. + """ + frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model( + saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set, + signature_key, batch_size) + + result = convert.toco_convert( + input_data=frozen_graph_def, + input_tensors=in_tensors, + output_tensors=out_tensors, + inference_type=inference_type, + input_format=input_format, + output_format=output_format, + quantized_input_stats=quantized_input_stats, + drop_control_dependency=drop_control_dependency) + + if output_file is not None: + with gfile.Open(output_file, "wb") as f: + f.write(result) + logging.info("Successfully converted to: %s", output_file) + + return result diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py index 734e42d619bdb79de0306a94e304ce46065d14d4..db95fc8ad7f94b52d33c72f6ec5819bdfe8cf05f 100644 --- a/tensorflow/contrib/lite/python/convert_saved_model_test.py +++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""TF Lite SavedModel Conversion test cases. - - - test on generated saved_models from simple graphs (sanity check) - - test mnist savedmodel generated on-the-fly +"""TFLite SavedModel conversion test cases. + - Tests converting simple SavedModel graph to TFLite FlatBuffer. + - Tests converting simple SavedModel graph to frozen graph. + - Tests converting MNIST SavedModel to TFLite FlatBuffer. """ from __future__ import absolute_import @@ -25,6 +25,7 @@ from __future__ import print_function import os from tensorflow.contrib.lite.python import convert_saved_model +from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 from tensorflow.python import keras from tensorflow.python.client import session from tensorflow.python.estimator import estimator_lib as estimator @@ -37,6 +38,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops from tensorflow.python.ops.losses import losses +from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.saved_model import saved_model from tensorflow.python.training import training as train @@ -45,7 +47,7 @@ from tensorflow.python.training import training as train class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase): def _createSimpleSavedModel(self, shape): - """Create a simple savedmodel on the fly.""" + """Create a simple SavedModel on the fly.""" saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel") with session.Session() as sess: in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32) @@ -56,44 +58,78 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase): return saved_model_dir def testSimpleSavedModel(self): - """Test a simple savedmodel created on the fly.""" - # Create a simple savedmodel + """Test a simple SavedModel created on the fly.""" + # Create a simple SavedModel saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) # Convert to tflite - result = convert_saved_model.convert(saved_model_dir=saved_model_dir) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir) self.assertTrue(result) def testSimpleSavedModelWithNoneBatchSizeInShape(self): - """Test a simple savedmodel, with None in input tensor's shape.""" + """Test a simple SavedModel, with None in input tensor's shape.""" saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3]) - result = convert_saved_model.convert(saved_model_dir=saved_model_dir) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir) self.assertTrue(result) def testSimpleSavedModelWithMoreNoneInShape(self): - """Test a simple savedmodel, fail as more None in input shape.""" + """Test a simple SavedModel, fail as more None in input shape.""" saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, None, 3]) # Convert to tflite: this should raise ValueError, as 3rd dim is None. with self.assertRaises(ValueError): - convert_saved_model.convert(saved_model_dir=saved_model_dir) + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir) def testSimpleSavedModelWithWrongSignatureKey(self): - """Test a simple savedmodel, fail as given signature is invalid.""" + """Test a simple SavedModel, fail as given signature is invalid.""" saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) # Convert to tflite: this should raise ValueError, as # signature_key does not exit in the saved_model. with self.assertRaises(ValueError): - convert_saved_model.convert( + convert_saved_model.tflite_from_saved_model( saved_model_dir=saved_model_dir, signature_key="wrong-key") def testSimpleSavedModelWithWrongOutputArray(self): - """Test a simple savedmodel, fail as given output_arrays is invalid.""" - # Create a simple savedmodel + """Test a simple SavedModel, fail as given output_arrays is invalid.""" + # Create a simple SavedModel saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) # Convert to tflite: this should raise ValueError, as # output_arrays is not valid for the saved_model. with self.assertRaises(ValueError): - convert_saved_model.convert( - saved_model_dir=saved_model_dir, output_arrays="wrong-output") + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, output_arrays=["wrong-output"]) + + def testSimpleSavedModelWithWrongInputArrays(self): + """Test a simple SavedModel, fail as given input_arrays is invalid.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + # Checks invalid input_arrays. + with self.assertRaises(ValueError): + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, input_arrays=["wrong-input"]) + # Checks valid and invalid input_arrays. + with self.assertRaises(ValueError): + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, + input_arrays=["Placeholder", "wrong-input"]) + + def testSimpleSavedModelWithCorrectArrays(self): + """Test a simple SavedModel, with correct input_arrays and output_arrays.""" + saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3]) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, + input_arrays=["Placeholder"], + output_arrays=["add"]) + self.assertTrue(result) + + def testSimpleSavedModelWithCorrectInputArrays(self): + """Test a simple SavedModel, with correct input_arrays and input_shapes.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, + input_arrays=["Placeholder"], + input_shapes={"Placeholder": [1, 16, 16, 3]}) + self.assertTrue(result) def testMultipleMetaGraphDef(self): """Test saved model with multiple MetaGraphDef.""" @@ -119,20 +155,103 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase): sess, tags=[saved_model.tag_constants.SERVING, "additional_test_tag"], signature_def_map=signature_def_map) + # MetaGraphDef 2 builder.add_meta_graph(tags=["tflite"]) builder.save(True) # Convert to tflite - convert_saved_model.convert( + convert_saved_model.tflite_from_saved_model( saved_model_dir=saved_model_dir, tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"])) +class ConvertSavedModelTestBasicGraphToText(test_util.TensorFlowTestCase): + + def _createSimpleSavedModel(self, shape): + """Create a simple SavedModel.""" + saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel") + with session.Session() as sess: + in_tensor_1 = array_ops.placeholder( + shape=shape, dtype=dtypes.float32, name="inputB") + in_tensor_2 = array_ops.placeholder( + shape=shape, dtype=dtypes.float32, name="inputA") + out_tensor = in_tensor_1 + in_tensor_2 + inputs = {"x": in_tensor_1, "y": in_tensor_2} + outputs = {"z": out_tensor} + saved_model.simple_save(sess, saved_model_dir, inputs, outputs) + return saved_model_dir + + def _getInputArrayNames(self, model_proto): + return [data.name for data in model_proto.input_arrays] + + def _getInputArrayShapes(self, model_proto): + return [ + [dim for dim in data.shape.dims] for data in model_proto.input_arrays + ] + + def _get_model_flags_proto_from_file(self, filename): + proto = _model_flags_pb2.ModelFlags() + with gfile.Open(filename, "rb") as output_file: + proto.ParseFromString(output_file.read()) + output_file.close() + return proto + + def testSimpleSavedModel(self): + """Test a simple SavedModel.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + output_file_model = os.path.join(self.get_temp_dir(), "model.pb") + output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt") + + convert_saved_model.saved_model_to_frozen_graphdef( + saved_model_dir=saved_model_dir, + output_file_model=output_file_model, + output_file_flags=output_file_flags, + input_arrays=["inputB", "inputA"]) + + proto = self._get_model_flags_proto_from_file(output_file_flags) + self.assertEqual(proto.output_arrays, ["add"]) + self.assertEqual(self._getInputArrayNames(proto), ["inputA", "inputB"]) + self.assertEqual( + self._getInputArrayShapes(proto), [[1, 16, 16, 3], [1, 16, 16, 3]]) + + def testSimpleSavedModelWithDifferentInputNames(self): + """Test a simple SavedModel.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + output_file_model = os.path.join(self.get_temp_dir(), "model.pb") + output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt") + + # Check case where input shape is given. + convert_saved_model.saved_model_to_frozen_graphdef( + saved_model_dir=saved_model_dir, + output_file_model=output_file_model, + output_file_flags=output_file_flags, + input_arrays=["inputA"], + input_shapes={"inputA": [1, 16, 16, 3]}) + + proto = self._get_model_flags_proto_from_file(output_file_flags) + self.assertEqual(proto.output_arrays, ["add"]) + self.assertEqual(self._getInputArrayNames(proto), ["inputA"]) + self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]]) + + # Check case where input shape is None. + convert_saved_model.saved_model_to_frozen_graphdef( + saved_model_dir=saved_model_dir, + output_file_model=output_file_model, + output_file_flags=output_file_flags, + input_arrays=["inputA"], + input_shapes={"inputA": None}) + + proto = self._get_model_flags_proto_from_file(output_file_flags) + self.assertEqual(proto.output_arrays, ["add"]) + self.assertEqual(self._getInputArrayNames(proto), ["inputA"]) + self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]]) + + class Model(keras.Model): """Model to recognize digits in the MNIST dataset. - Train and export savedmodel, used for testOnflyTrainMnistSavedModel + Train and export SavedModel, used for testOnflyTrainMnistSavedModel Network structure is equivalent to: https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py @@ -238,7 +357,7 @@ def dummy_input_fn(): class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase): def testTrainedMnistSavedModel(self): - """Test mnist savedmodel, trained with dummy data and small steps.""" + """Test mnist SavedModel, trained with dummy data and small steps.""" # Build classifier classifier = estimator.Estimator( model_fn=model_fn, @@ -253,21 +372,20 @@ class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase): "image": image, }) - # Export savedmodel + # Export SavedModel saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel") classifier.export_savedmodel(saved_model_dir, pred_input_fn) # Convert to tflite and test output saved_model_name = os.listdir(saved_model_dir)[0] saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name) - output_tflite = os.path.join(saved_model_dir, - saved_model_final_dir + ".lite") + output_file = os.path.join(saved_model_dir, saved_model_final_dir + ".lite") # TODO(zhixianyan): no need to limit output_arrays to `Softmax' # once b/74205001 fixed and argmax implemented in tflite. - result = convert_saved_model.convert( + result = convert_saved_model.tflite_from_saved_model( saved_model_dir=saved_model_final_dir, - output_arrays="Softmax", - output_tflite=output_tflite) + output_arrays=["Softmax"], + output_file=output_file) self.assertTrue(result) diff --git a/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..4d9782f4a6a9e853c3afdbd97d4264a818937e63 --- /dev/null +++ b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py @@ -0,0 +1,106 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python console command for generating frozen models from SavedModels. + +This exists to add SavedModel compatibility to TOCO. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys +from tensorflow.contrib.lite.python.convert_saved_model import saved_model_to_frozen_graphdef +from tensorflow.python.platform import app + +FLAGS = None + + +def execute(unused_args): + """Calls function to convert the SavedModel to a frozen graph.""" + # Error handling. + if FLAGS.input_shapes and not FLAGS.input_arrays: + raise ValueError("Input shapes requires input arrays to be specified.") + + # Calls saved_model_to_frozen_graphdef function to generate frozen graph. + input_arrays = (FLAGS.input_arrays.split(",") if FLAGS.input_arrays else None) + input_shapes = None + if FLAGS.input_shapes: + input_shapes = { + input_arrays[idx]: shape.split(",") + for idx, shape in enumerate(FLAGS.input_shapes.split(":")) + } + output_arrays = ( + FLAGS.output_arrays.split(",") if FLAGS.output_arrays else None) + tag_set = set(FLAGS.tag_set.split(",")) if FLAGS.tag_set else None + + saved_model_to_frozen_graphdef( + saved_model_dir=FLAGS.saved_model_directory, + output_file_model=FLAGS.output_file_model, + output_file_flags=FLAGS.output_file_flags, + input_arrays=input_arrays, + input_shapes=input_shapes, + output_arrays=output_arrays, + tag_set=tag_set, + signature_key=FLAGS.signature_key, + batch_size=FLAGS.batch_size) + + +def main(): + global FLAGS + # Parses flags. + parser = argparse.ArgumentParser( + description="Invoke SavedModel to frozen model converter.") + parser.add_argument( + "saved_model_directory", + type=str, + help="Full path to directory containing the SavedModel.") + parser.add_argument( + "output_file_model", + type=str, + help="Full file path to save frozen graph.") + parser.add_argument( + "output_file_flags", type=str, help="Full file path to save ModelFlags.") + parser.add_argument( + "--input_arrays", + type=str, + help="Name of the input arrays, comma-separated.") + parser.add_argument( + "--input_shapes", + type=str, + help="Shapes corresponding to --input_arrays, colon-separated.") + parser.add_argument( + "--output_arrays", + type=str, + help="Name of the output arrays, comma-separated.") + parser.add_argument( + "--tag_set", type=str, help="Name of output arrays, comma-separated.") + parser.add_argument( + "--signature_key", + type=str, + help="Key identifying SignatureDef containing inputs and outputs.") + parser.add_argument( + "--batch_size", + type=int, + help="Batch size for the model. Replaces the first dimension of an " + "input size array if undefined.") + + FLAGS, unparsed = parser.parse_known_args() + + app.run(main=execute, argv=[sys.argv[0]] + unparsed) + + +if __name__ == "__main__": + main() diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/convert_test.py similarity index 82% rename from tensorflow/contrib/lite/python/lite_test.py rename to tensorflow/contrib/lite/python/convert_test.py index b8b4510188bee867b32ffde714b27f41a1df778a..dc21a9b66933f595a5f31b0b91ff247a5458dad6 100644 --- a/tensorflow/contrib/lite/python/lite_test.py +++ b/tensorflow/contrib/lite/python/convert_test.py @@ -17,8 +17,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.lite.python import lite -from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base +from tensorflow.contrib.lite.python import convert +from tensorflow.contrib.lite.python import lite_constants +from tensorflow.contrib.lite.python import op_hint from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util @@ -29,7 +30,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class LiteTest(test_util.TensorFlowTestCase): +class ConvertTest(test_util.TensorFlowTestCase): def testBasic(self): in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3], @@ -37,13 +38,13 @@ class LiteTest(test_util.TensorFlowTestCase): out_tensor = in_tensor + in_tensor sess = session.Session() # Try running on valid graph - result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor]) + result = convert.toco_convert(sess.graph_def, [in_tensor], [out_tensor]) self.assertTrue(result) # TODO(aselle): remove tests that fail (we must get TOCO to not fatal # all the time). # Try running on identity graph (known fail) # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"): - # result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor]) + # result = convert.toco_convert(sess.graph_def, [in_tensor], [in_tensor]) def testQuantization(self): in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3], @@ -51,13 +52,14 @@ class LiteTest(test_util.TensorFlowTestCase): out_tensor = array_ops.fake_quant_with_min_max_args(in_tensor + in_tensor, min=0., max=1.) sess = session.Session() - result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor], - inference_type=lite.QUANTIZED_UINT8, - quantized_input_stats=[(0., 1.)]) + result = convert.toco_convert( + sess.graph_def, [in_tensor], [out_tensor], + inference_type=lite_constants.QUANTIZED_UINT8, + quantized_input_stats=[(0., 1.)]) self.assertTrue(result) -class LiteTestOpHint(test_util.TensorFlowTestCase): +class ConvertTestOpHint(test_util.TensorFlowTestCase): """Test the hint to stub functionality.""" def _getGraphOpTypes(self, graphdef, output_nodes): @@ -99,7 +101,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): swish_scale = array_ops.constant(1.0) def _swish(input_tensor, scale): - custom = lite.OpHint("cool_activation") + custom = op_hint.OpHint("cool_activation") input_tensor, scale = custom.add_inputs(input_tensor, scale) output = math_ops.sigmoid(input_tensor) * input_tensor * scale output, = custom.add_outputs(output) @@ -111,11 +113,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): # and 1 final output). self.assertEqual(self._countIdentities(sess.graph_def.node), 4) - stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess) self.assertCountEqual( self._getGraphOpTypes( - stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + stubbed_graphdef, + output_nodes=[op_hint._tensor_name_base(output)]), ["cool_activation", "Const", "Identity"]) def testScaleAndBiasAndIdentity(self): @@ -125,7 +128,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): b = array_ops.constant([4., 5.]) def _scaled_and_bias_and_identity(a, x, b): - custom = lite.OpHint("scale_and_bias_and_identity") + custom = op_hint.OpHint("scale_and_bias_and_identity") a, x, b = custom.add_inputs(a, x, b) return custom.add_outputs(a * x + b, x) output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b), @@ -136,11 +139,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): # +1 for the final output self.assertEqual(self._countIdentities(sess.graph_def.node), 6) - stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess) self.assertCountEqual( self._getGraphOpTypes( - stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + stubbed_graphdef, + output_nodes=[op_hint._tensor_name_base(output)]), ["scale_and_bias_and_identity", "Const", "Identity", "Pack"]) def testTwoFunctions(self): @@ -148,7 +152,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): a = array_ops.constant([1.]) b = array_ops.constant([1.]) def _double_values(x): - custom = lite.OpHint("add_test") + custom = op_hint.OpHint("add_test") x = custom.add_inputs(x) output = math_ops.multiply(x, x) output, = custom.add_outputs(output) @@ -160,10 +164,11 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): # make sure one identity for each input (2) and output (2) => 2 + 2 # +1 for the final output self.assertEqual(self._countIdentities(sess.graph_def.node), 5) - stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess) self.assertCountEqual( self._getGraphOpTypes( - stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + stubbed_graphdef, + output_nodes=[op_hint._tensor_name_base(output)]), ["add_test", "Const", "Identity", "Add"]) diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index cf50f9d4d65cb7a36af8f82e2d29babbc9884d23..4ea40201f73bb0aabee60866dde2337862149dc7 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -18,6 +18,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice. @@toco_convert @@toco_convert_protos +@@tflite_from_saved_model @@OpHint @@convert_op_hints_to_stubs @@ -25,208 +26,11 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice. from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os as _os -import subprocess as _subprocess -import tempfile as _tempfile # pylint: disable=unused-import +from tensorflow.contrib.lite.python.convert import toco_convert +from tensorflow.contrib.lite.python.convert import toco_convert_protos +from tensorflow.contrib.lite.python.convert_saved_model import tflite_from_saved_model from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs from tensorflow.contrib.lite.python.op_hint import OpHint # pylint: enable=unused-import -from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 -from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 -from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2 -from tensorflow.python.framework import dtypes as _dtypes -from tensorflow.python.platform import resource_loader as _resource_loader -from tensorflow.python.util.all_util import remove_undocumented -from tensorflow.python.util.lazy_loader import LazyLoader - -# Lazy load since some of the performance benchmark skylark rules -# break dependencies. -_toco_python = LazyLoader( - "tensorflow_wrap_toco", globals(), - "tensorflow.contrib.lite.toco.python." - "tensorflow_wrap_toco") -del LazyLoader - -# Enum types from the protobuf promoted to the API -FLOAT = _types_pb2.FLOAT -INT32 = _types_pb2.INT32 -INT64 = _types_pb2.INT64 -STRING = _types_pb2.STRING -QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8 -TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF -TFLITE = _toco_flags_pb2.TFLITE -GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT - -# Currently the default mode of operation is to shell to another python process -# to protect against crashes. However, it breaks some dependent targets because -# it forces us to depend on an external py_binary. The experimental API doesn't -# have that drawback. -EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False - -# Find the toco_from_protos binary using the resource loader if using from -# bazel, otherwise we are in a pip where console_scripts already has -# the toco_from_protos tool. -if EXPERIMENTAL_USE_TOCO_API_DIRECTLY: - _toco_from_proto_bin = "" -else: - _toco_from_proto_bin = _resource_loader.get_path_to_datafile( - "../toco/python/toco_from_protos") - -if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin): - _toco_from_proto_bin = "toco_from_protos" - - -def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str): - """Convert `input_data_str` according to model and toco parameters. - - Unless you know what you are doing consider using - the more friendly @{tf.contrib.lite.toco_convert}}. - - Args: - model_flags_str: Serialized proto describing model properties, see - `toco/model_flags.proto`. - toco_flags_str: Serialized proto describing conversion properties, see - `toco/toco_flags.proto`. - input_data_str: Input data in serialized form (e.g. a graphdef is common) - Returns: - Converted model in serialized form (e.g. a TFLITE model is common). - Raises: - RuntimeError: When conversion fails, an exception is raised with the error - message embedded. - """ - # TODO(aselle): When toco does not use fatal errors for failure, we can - # switch this on. - if not _toco_from_proto_bin: - return _toco_python.TocoConvert( - model_flags_str, toco_flags_str, input_data_str) - - with _tempfile.NamedTemporaryFile() as fp_toco, \ - _tempfile.NamedTemporaryFile() as fp_model, \ - _tempfile.NamedTemporaryFile() as fp_input, \ - _tempfile.NamedTemporaryFile() as fp_output: - fp_model.write(model_flags_str) - fp_toco.write(toco_flags_str) - fp_input.write(input_data_str) - fp_model.flush() - fp_toco.flush() - fp_input.flush() - - cmd = [ - _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name, - fp_output.name - ] - cmdline = " ".join(cmd) - proc = _subprocess.Popen( - cmdline, - shell=True, - stdout=_subprocess.PIPE, - stderr=_subprocess.STDOUT, - close_fds=True) - stdout, stderr = proc.communicate() - exitcode = proc.returncode - if exitcode == 0: - stuff = fp_output.read() - return stuff - else: - raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" % - (stdout, stderr)) - - -def _tensor_name(x): - return x.name.split(":")[0] - - -def toco_convert(input_data, - input_tensors, - output_tensors, - inference_type=FLOAT, - input_format=TENSORFLOW_GRAPHDEF, - output_format=TFLITE, - quantized_input_stats=None, - drop_control_dependency=True, - allow_custom_ops=None): - """Convert a model using TOCO from `input_format` to `output_format`. - - Typically this is to convert from TensorFlow GraphDef to TFLite, in which - case the default `input_format` and `output_format` are sufficient. - - Args: - input_data: Input data (i.e. often `sess.graph_def`). - input_tensors: List of input tensors. Type and shape are computed using - `foo.get_shape()` and `foo.dtype`. - output_tensors: List of output tensors (only .name is used from this). - inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`. - input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF). - output_format: Type of data to write (currently must be TFLITE or - GRAPHVIZ_DOT) - quantized_input_stats: For each member of input_tensors the mean and - std deviation of training data. Only needed if `inference_type` is - `QUANTIZED_UINT8`. - drop_control_dependency: Drops control dependencies silently. This is due - to tf lite not supporting control dependencies. - - Returns: - The converted data. For example if tflite was the destination, then - this will be a tflite flatbuffer in a bytes array. - - Raises: - ValueError: If the input tensor type is unknown - RuntimeError: If TOCO fails to convert (in which case the runtime error's - error text will contain the TOCO error log) - """ - toco = _toco_flags_pb2.TocoFlags() - toco.input_format = input_format - toco.output_format = output_format - toco.inference_type = inference_type - toco.drop_control_dependency = drop_control_dependency - if allow_custom_ops is not None: - toco.allow_custom_ops = allow_custom_ops - - model = _model_flags_pb2.ModelFlags() - for idx, input_tensor in enumerate(input_tensors): - if input_tensor.dtype == _dtypes.float32: - tflite_input_type = FLOAT - elif input_tensor.dtype == _dtypes.int32: - tflite_input_type = INT32 - elif input_tensor.dtype == _dtypes.int64: - tflite_input_type = INT64 - # TODO(aselle): Insert strings when they are available - else: - raise ValueError("Tensors %s not known type %r" % (input_tensor.name, - input_tensor.dtype)) - - input_array = model.input_arrays.add() - - if inference_type == QUANTIZED_UINT8: - if tflite_input_type == FLOAT: - tflite_input_type = QUANTIZED_UINT8 - input_array.mean_value, input_array.std_value = quantized_input_stats[idx] - - input_array.name = _tensor_name(input_tensor) - input_array.shape.dims.extend(map(int, input_tensor.get_shape())) - - for output_tensor in output_tensors: - model.output_arrays.append(_tensor_name(output_tensor)) - - # TODO(aselle): Consider handling the case of allowing quantized - # inputs to be converted to float (via the toco.inference_input_type field). - data = toco_convert_protos(model.SerializeToString(), - toco.SerializeToString(), - input_data.SerializeToString()) - return data - - -_allowed_symbols = [ - "FLOAT", - "INT32", - "INT64", - "STRING", - "QUANTIZED_UINT8", - "TENSORFLOW_GRAPHDEF", - "TFLITE", - "GRAPHVIZ_DOT", - "EXPERIMENTAL_USE_TOCO_API_DIRECTLY", -] -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/lite/python/lite_constants.py b/tensorflow/contrib/lite/python/lite_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..195d7a732f337676937c7af5137d4dea84989c03 --- /dev/null +++ b/tensorflow/contrib/lite/python/lite_constants.py @@ -0,0 +1,53 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Constants for TFLite.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 +from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2 +from tensorflow.python.util.all_util import remove_undocumented + +# Enum types from the protobuf promoted to the API +FLOAT = _types_pb2.FLOAT +INT32 = _types_pb2.INT32 +INT64 = _types_pb2.INT64 +STRING = _types_pb2.STRING +QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8 +TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF +TFLITE = _toco_flags_pb2.TFLITE +GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT + +# Currently the default mode of operation is to shell to another python process +# to protect against crashes. However, it breaks some dependent targets because +# it forces us to depend on an external py_binary. The experimental API doesn't +# have that drawback. +EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False + + +_allowed_symbols = [ + "FLOAT", + "INT32", + "INT64", + "STRING", + "QUANTIZED_UINT8", + "TENSORFLOW_GRAPHDEF", + "TFLITE", + "GRAPHVIZ_DOT", + "EXPERIMENTAL_USE_TOCO_API_DIRECTLY", +] +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs index 2b62c257d8410f9af1b250c9d108eba6737a9efe..a65c2e0c70dcf9f97e49cc749132afbd5fc69365 100644 --- a/tensorflow/contrib/lite/schema/schema.fbs +++ b/tensorflow/contrib/lite/schema/schema.fbs @@ -435,21 +435,25 @@ table Operator { custom_options_format:CustomOptionsFormat; } -// The root type, defining a model. +// The root type, defining a subgraph, which typically represents an entire +// model. table SubGraph { - // A list of all tensors used in this model. + // A list of all tensors used in this subgraph. tensors:[Tensor]; - // Indices of the input tensors. + // Indices of the tensors that are inputs into this subgraph. Note this is + // the list of non-static tensors that feed into the subgraph for inference. inputs:[int]; - // Indices of the output tensors. + // Indices of the tensors that are outputs out of this subgraph. Note this is + // the list of output tensors that are considered the product of the + // subgraph's inference. outputs:[int]; // All operators, in execution order. operators:[Operator]; - // Name of subgraph (used for debugging). + // Name of this subgraph (used for debugging). name:string; } @@ -475,7 +479,10 @@ table Model { // A description of the model. description:string; - // Buffers of the model + // Buffers of the model. + // Note the 0th entry of this array must be an empty buffer (sentinel). + // This is a convention so that tensors without a buffer can provide 0 as + // their buffer. buffers:[Buffer]; } diff --git a/tensorflow/contrib/lite/string_util.cc b/tensorflow/contrib/lite/string_util.cc index cd41299d38361321503d421272426a9d1082c937..a89776b29f895fe82ee71efe00c0949c58c109df 100644 --- a/tensorflow/contrib/lite/string_util.cc +++ b/tensorflow/contrib/lite/string_util.cc @@ -24,7 +24,10 @@ namespace tflite { namespace { // Convenient method to get pointer to int32_t. -int32_t* GetIntPtr(char* ptr) { return reinterpret_cast(ptr); } +const int32_t* GetIntPtr(const char* ptr) { + return reinterpret_cast(ptr); +} + } // namespace void DynamicBuffer::AddString(const char* str, size_t len) { @@ -64,7 +67,7 @@ void DynamicBuffer::AddJoinedString(const std::vector& strings, offset_.push_back(offset_.back() + total_len); } -void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) { +int DynamicBuffer::WriteToBuffer(char** buffer) { // Allocate sufficient memory to tensor buffer. int32_t num_strings = offset_.size() - 1; // Total bytes include: @@ -75,43 +78,57 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) { int32_t bytes = data_.size() // size of content + sizeof(int32_t) * (num_strings + 2); // size of header - // Output tensor will take over the ownership of tensor_buffer, and free it - // during Interpreter destruction. - char* tensor_buffer = static_cast(malloc(bytes)); + // Caller will take ownership of buffer. + *buffer = reinterpret_cast(malloc(bytes)); // Set num of string - memcpy(tensor_buffer, &num_strings, sizeof(int32_t)); + memcpy(*buffer, &num_strings, sizeof(int32_t)); // Set offset of strings. int32_t start = sizeof(int32_t) * (num_strings + 2); for (int i = 0; i < offset_.size(); i++) { int32_t offset = start + offset_[i]; - memcpy(tensor_buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t)); + memcpy(*buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t)); } // Copy data of strings. - memcpy(tensor_buffer + start, data_.data(), data_.size()); + memcpy(*buffer + start, data_.data(), data_.size()); + return bytes; +} + +void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) { + char* tensor_buffer; + int bytes = WriteToBuffer(&tensor_buffer); // Set tensor content pointer to tensor_buffer, and release original data. auto dims = TfLiteIntArrayCreate(1); - dims->data[0] = num_strings; + dims->data[0] = offset_.size() - 1; // Store number of strings. TfLiteTensorReset(tensor->type, tensor->name, dims, tensor->params, tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation, tensor); } +int GetStringCount(const char* raw_buffer) { + // The first integers in the raw buffer is the number of strings. + return *GetIntPtr(raw_buffer); +} + int GetStringCount(const TfLiteTensor* tensor) { // The first integers in the raw buffer is the number of strings. - return *GetIntPtr(tensor->data.raw); + return GetStringCount(tensor->data.raw); } -StringRef GetString(const TfLiteTensor* tensor, int string_index) { - int32_t* offset = - GetIntPtr(tensor->data.raw + sizeof(int32_t) * (string_index + 1)); +StringRef GetString(const char* raw_buffer, int string_index) { + const int32_t* offset = + GetIntPtr(raw_buffer + sizeof(int32_t) * (string_index + 1)); return { - tensor->data.raw + (*offset), + raw_buffer + (*offset), (*(offset + 1)) - (*offset), }; } +StringRef GetString(const TfLiteTensor* tensor, int string_index) { + return GetString(tensor->data.raw, string_index); +} + } // namespace tflite diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h index c35a2fff3c23b17515323b65d08df6f6da288834..57f129bf5ea7ab7e792cc01f83e281922dbfb834 100644 --- a/tensorflow/contrib/lite/string_util.h +++ b/tensorflow/contrib/lite/string_util.h @@ -49,7 +49,7 @@ namespace tflite { // Convenient structure to store string pointer and length. typedef struct { - char* str; + const char* str; int len; } StringRef; @@ -70,6 +70,10 @@ class DynamicBuffer { // buffer. void AddJoinedString(const std::vector& strings, char separator); + // Fill content into a buffer and returns the number of bytes stored. + // The function allocates space for the buffer but does NOT take ownership. + int WriteToBuffer(char** buffer); + // Fill content into a string tensor. void WriteToTensor(TfLiteTensor* tensor); @@ -81,10 +85,12 @@ class DynamicBuffer { }; // Return num of strings in a String tensor. +int GetStringCount(const char* raw_buffer); int GetStringCount(const TfLiteTensor* tensor); // Get String pointer and length of index-th string in tensor. // NOTE: This will not create a copy of string data. +StringRef GetString(const char* raw_buffer, int string_index); StringRef GetString(const TfLiteTensor* tensor, int string_index); } // namespace tflite diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py index e045c27427f1cc40c824c74ca19f0426075c650d..f72a4e0d8cbc89c9de5ee0df61f78d7d32bde73e 100644 --- a/tensorflow/contrib/lite/testing/generate_examples.py +++ b/tensorflow/contrib/lite/testing/generate_examples.py @@ -1758,19 +1758,7 @@ def make_strided_slice_tests(zip_path): "shrink_axis_mask": [None, 1, 8, 11, 15, -1], "constant_indices": [False, True], }, - # - { - "dtype": [tf.float32], - "index_type": [tf.int32], - "input_shape": [[12, 2, 2, 5]], - "begin": [[0]], - "end": [[1]], - "strides": [[1]], - "begin_mask": [0], - "end_mask": [0], - "shrink_axis_mask": [1], - "constant_indices": [True], - }, + # TODO(b/73170889) Restore test paramaters removed in cl/191608113. # 2-D { "dtype": [tf.float32, tf.int32, tf.int64], diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD index 398978b14580802dd727f4be400391167e9f0ecd..3f73ef620e121bf3568db55e8a9fbe799900bc90 100644 --- a/tensorflow/contrib/lite/toco/BUILD +++ b/tensorflow/contrib/lite/toco/BUILD @@ -219,6 +219,7 @@ cc_library( "graph_transformations/drop_fake_quant.cc", "graph_transformations/drop_im2col_arrays.cc", "graph_transformations/ensure_bias_vectors.cc", + "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc", "graph_transformations/experimental_shuffle_fc_weights.cc", "graph_transformations/fuse_activation_functions.cc", "graph_transformations/fuse_binary_into_following_affine.cc", @@ -238,6 +239,7 @@ cc_library( "graph_transformations/merge_reshape_into_preceding_transpose.cc", "graph_transformations/propagate_activation_function_into_constants.cc", "graph_transformations/propagate_array_data_types.cc", + "graph_transformations/propagate_default_min_max.cc", "graph_transformations/propagate_fake_quant_num_bits.cc", "graph_transformations/propagate_fixed_sizes.cc", "graph_transformations/quantization_util.cc", diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h index 71e7318ac367bededa40f1bcbc40b473b49657f8..fe30b88344c5340dc8647cd89e244987c86e47fe 100644 --- a/tensorflow/contrib/lite/toco/args.h +++ b/tensorflow/contrib/lite/toco/args.h @@ -227,6 +227,8 @@ struct ParsedTocoFlags { // TODO(aselle): command_line_flags doesn't support doubles Arg default_ranges_min = Arg(0.); Arg default_ranges_max = Arg(0.); + Arg default_int16_ranges_min = Arg(0.); + Arg default_int16_ranges_max = Arg(0.); Arg inference_type; Arg inference_input_type; Arg drop_fake_quant = Arg(false); @@ -238,6 +240,7 @@ struct ParsedTocoFlags { Arg debug_disable_recurrent_cell_fusion = Arg(false); Arg drop_control_dependency = Arg(false); Arg propagate_fake_quant_num_bits = Arg(false); + Arg allow_nudging_weights_to_use_fast_gemm_kernel = Arg(false); }; } // namespace toco diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc index c289ddcd929c74e9a5b0eaf2e0c8fc1a6a01d320..5bb0e3ba4d289ccfcb54198230f53c62222963a2 100644 --- a/tensorflow/contrib/lite/toco/dump_graphviz.cc +++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc @@ -259,6 +259,19 @@ NodeProperties GetPropertiesForOperator(const Operator& op) { node_properties.color = Color(0xC5, 0x39, 0x29); // Bolder color break; } + case OperatorType::kFakeQuant: { + const auto& fakequant_op = static_cast(op); + node_properties.color = Color(0xC5, 0x39, 0x29); // Bolder color + if (fakequant_op.minmax) { + AppendF(&node_properties.label, "\\n%dbit [%g,%g]", + fakequant_op.num_bits, fakequant_op.minmax->min, + fakequant_op.minmax->max); + } else { + AppendF(&node_properties.label, "\\n%dbit [?,?]", + fakequant_op.num_bits); + } + break; + } default: node_properties.color = Color(0xDB, 0x44, 0x37); break; diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc new file mode 100644 index 0000000000000000000000000000000000000000..394fa349e2663e2806344f27a96a5132a2d4a810 --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc @@ -0,0 +1,209 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace toco { + +// === Summary === +// +// TLDR: Some of our 8-bit arithmetic operations require uint8 weight values +// to avoid the value 0, thus ranging only in [1, 255]. This enables faster +// runtime arithmetic kernels on ARM NEON. This is not relevant on most +// other hardware architectures, and will cease to be relevant on ARM NEON +// in the future. These topics are elaborated below ("Context"). +// +// Having just one isolated uint8 value equal to 0 is fine. The bad case is when +// two uint8 values are both zero and are less than 16 bytes apart. +// +// By default, toco generates a fatal error when that happens. The user may opt +// in to more lax behavior by passing +// --allow_nudging_weights_to_use_fast_gemm_kernel. +// This causes toco to nudge such bad 0 values into the value 1, thus avoiding +// the problem in exchange for compromising on accuracy. +// +// The present graph transformation implements both the default fatal-erroring +// behavior, and, when allow_nudging_weights is set, also the lax nudging +// behavior. +// +// +// === Context === +// +// Since March 2017, we have been using a trick to perform faster +// 8bit matrix multiplications, to our knowledge first implemented in gemmlowp +// here: +// https://github.com/google/gemmlowp/commit/25b2989415b99e797e1ab977837111b2e231f81f +// +// This trick is explained in Appendix B of our paper, +// https://arxiv.org/abs/1712.05877 +// +// Here is the relevant paragraph: +// +// For efficient NEON implementation of the matrix multiplication’s +// core accumulation, we use the following trick. +// In the multiply-add operation in (10), we first change the +// operands’ type from uint8 to int8 (which can be done by +// subtracting 128 from the quantized values and zero-points). +// Thus the core multiply-add becomes +// +// int32 += int8 * int8. (B.1) +// +// As mentioned in section 3, with a minor tweak of the quantized +// training process, we can ensure that the weights, once +// quantized as int8 values, never take the value −128. Hence, +// the product in (B.1) is never −128 ∗ −128, and is therefore +// always less than 2^14 in absolute value. Hence, (B.1) +// can accumulate two products on a local int16 accumulator +// before that needs to be accumulated into the true int32 accumulator. +// This allows the use of an 8-way SIMD multiplication +// (SMULL on int8 operands), followed by an 8-way +// SIMD multiply-add (SMLAL on int8 operands), followed +// by a pairwise-add-and-accumulate into the int32 accumulators +// (SADALP). +// +// As that paragraph notes, quantized training should be suitably modified to +// ensure that quantized uint8 weights value only range in [1, 255]. So the +// problem that we are dealing with is only about the existing 8-bit quantized +// models that haven't been trained specifically to get 8-bit weights only in +// [1, 255]. +// +// This spreadsheet shows the speed benefit of this trick across many existing +// ARM-architecture CPUs: +// +// https://docs.google.com/spreadsheets/d/1-0LjdMvW0XtH1bYknC0bQINoFaxjTuL9eplZZcitykI/edit?usp=sharing +// +// Compare Row 18 (fast int8 trick) to Row 20 (regular uint8 kernel). +// +// The introduction of the 'dotprod' extension to ARM NEON, specifically the +// SDOT instruction, renders this eventually moot. See the experimental +// kernels contributed by ARM here, +// +// https://github.com/google/gemmlowp/pull/116 +// +// However, as of April 2018, there don't seem to be any commercially available +// CPU supporting these instructions (yet); we are waiting for +// Cortex-A{75,55}-r1 to become available; the "-r1" is key here. Even if such +// CPUs become available soon, it will presumably take years for them to +// overtake the large volume of existing CPUs not supporting these new +// instructions, especially in current and future low-end devices. All in all, +// we can foresee these 'fast int8 kernels' to remain important to have into +// the 2020s. +// +bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model, + std::size_t op_index) { + const auto& op = *model->operators[op_index]; + int weights_index = 0; + switch (op.type) { + case OperatorType::kConv: + weights_index = 1; + break; + case OperatorType::kLstmCell: + weights_index = 2; + break; + case OperatorType::kFullyConnected: { + weights_index = 1; + const auto& fc_op = static_cast(op); + CHECK(!fc_op.experimental_shuffled_weights) + << "This graph transformation expects to run before FC weights get " + "shuffled."; + break; + } + default: + // Other operator types are unaffected by this graph transformation, + // because their runtime implementations don't use the fast int8 trick. + // In particular that's the case of DepthwiseConv at the moment. + // We have to update this logic when that changes, e.g. if in the future + // some DepthwiseConv kernel wants to use the trick. + // + // The reason why that's not so likely, hence why it's fairly safe to + // stay conservative in the list of operators that we handle here, is that + // the fast int8 kernel trick is only applicable to ops that either are + // implemented as a GEMM, or use symmetric ranges for both weights and + // activations. The reason why GEMM is special (can use the trick even + // without symmetric ranges) is that it is so arithmetic-intense that + // it can use techniques reducing its implementation to the symmetric + // ranges case, with limited relative overhead (O(N^2) overhead vs + // O(N^3) GEMM cost). See https://arxiv.org/pdf/1712.05877, section + // 2.3 Efficient handling of zero-points. + // + // That's why at the moment we only handle operators that use a GEMM + // (Conv, fully-connected --- note that LSTM merely wraps a + // fully-connected operator). + return false; + } + + const string& name = op.inputs[weights_index]; + auto& array = model->GetArray(name); + if (!array.buffer) { + return false; + } + if (array.data_type != ArrayDataType::kUint8) { + return false; + } + auto& buffer_data = array.GetMutableBuffer().data; + + int count_bad = 0; + int index_of_previous_bad_value = 0; + bool changed = false; + + for (int i = 0; i < buffer_data.size(); i++) { + if (buffer_data[i] == 0) { + count_bad++; + if (count_bad > 1) { + const int distance = i - index_of_previous_bad_value; + // Semi-arbitrary threshold. The idea is that trouble only occurs + // when two bad values are very close to each other so that they + // are jointly used within registers inside some GEMM kernel. + // The details of that depend on the kernel. Our current fast ARM64 + // kernel, for instance, only has an issue when the distance between + // consecutive bad values is exactly 8. We do not want to track such + // kernel details too closely here, so we pick a threshold that's + // a bit larger than that, to give us room to change kernels in the + // future without worrying. + static constexpr int kMinDistanceBetweenBadValues = 16; + if (distance < kMinDistanceBetweenBadValues) { + if (allow_nudging_weights()) { + buffer_data[i] = 1; + changed = true; + continue; + } + LOG(FATAL) << "Bad value for " << name << " at index " << i + << ", previous bad value at index " + << index_of_previous_bad_value << ", distance=" << distance + << ", kMinDistanceBetweenBadValues=" + << kMinDistanceBetweenBadValues << ". Consider passing " + << "--allow_nudging_weights_to_use_fast_gemm_kernel " + << "if you don't care about accuracy."; + } + } + index_of_previous_bad_value = i; + } + } + + if (changed) { + AddMessageF("Tweaked weights values for %s", LogName(op)); + } + + return changed; +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc index f098981a5cf4b91df4c7798bd3db8563705a3bd0..c00cdcb944b085dda41033b95c96537cc2e047c3 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc @@ -55,17 +55,26 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { // Exit if, based on the known shapes, this FC op is not a GEMV. // The shuffling of FC weights is only useful to enable fast GEMV paths. const Shape& input_shape = input_array.shape(); - for (int i = 0; i < input_shape.dimensions_count() - 1; i++) { + for (int i = 1; i < input_shape.dimensions_count() - 1; i++) { if (input_shape.dims(i) != 1) { // The input activations, shaped as a matrix, have multiple columns. // This FC op isn't a matrix*vector multiplication. AddMessageF( "Not applying experimental shuffling to the weights of %s because " - "it's not a matrix*vector product", + "the input shape is not 1D or 2D (possibly with additional inner " + "dimensions of size 1)", LogName(*op)); return false; } } + if (input_shape.dims(0) != 1 && input_shape.dims(0) != 4) { + AddMessageF( + "Not applying experimental shuffling to the weights of %s because " + "the input shape's leading dimension, i.e. the 'batch size', is not " + "equal to 1 or 4", + LogName(*op)); + return false; + } // Exit if the weights shape isn't an integral multiple of the shuffled // block shape, 4x16. We don't want to have to write code dealing with // odd sizes, that would go un-exercised at the moment as the models @@ -129,6 +138,20 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { fc_op->experimental_shuffled_weights = true; AddMessageF("Applied experimental shuffling to the weights of %s", LogName(*op)); + // Add a second output array to this FC op, serving as a workspace to perform + // runtime shuffling/xoring of its input activations. + CHECK_EQ(fc_op->outputs.size(), 1); + const string& shuffled_input_workspace_array_name = + AvailableArrayName(*model, fc_op->inputs[0] + "_shuffled"); + fc_op->outputs.push_back(shuffled_input_workspace_array_name); + auto& shuffled_input_workspace_array = + model->GetOrCreateArray(shuffled_input_workspace_array_name); + shuffled_input_workspace_array.data_type = input_array.data_type; + *shuffled_input_workspace_array.mutable_shape() = input_array.shape(); + shuffled_input_workspace_array.GetOrCreateMinMax() = input_array.GetMinMax(); + shuffled_input_workspace_array.GetOrCreateQuantizationParams() = + input_array.GetQuantizationParams(); + return true; } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h index 56b3dec5c495dd56318cb528c9fe530d6eb217a6..72ffd51db45d0808feab3d07436c13db2420a680 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h +++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h @@ -190,6 +190,24 @@ DECLARE_GRAPH_TRANSFORMATION(Dequantize) DECLARE_GRAPH_TRANSFORMATION(UnpartitionEmbeddingLookup) DECLARE_GRAPH_TRANSFORMATION(ExperimentalShuffleFCWeights) +class PropagateDefaultMinMax : public GraphTransformation { + public: + bool Run(Model* model, std::size_t op_index) override; + const char* Name() const override { return "PropagateDefaultMinMax"; } + + bool has_any_ranges_defined() const { return !type_ranges_.empty(); } + void DefineTypeRange(ArrayDataType data_type, double min, double max) { + MinMax minmax; + minmax.min = min; + minmax.max = max; + type_ranges_.emplace_back(data_type, minmax); + } + + private: + bool SetArrayMinMax(const string& array_name, Array* array); + std::vector> type_ranges_; +}; + class ResolveReshapeAttributes : public GraphTransformation { public: bool Run(Model* model, std::size_t op_index) override; @@ -228,6 +246,19 @@ class ResolveConstantFakeQuant : public GraphTransformation { bool propagate_fake_quant_num_bits_ = false; }; +class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation { + public: + bool Run(Model* model, std::size_t op_index) override; + const char* Name() const override { + return "EnsureUint8WeightsSafeForFastInt8Kernels"; + } + bool allow_nudging_weights() const { return allow_nudging_weights_; } + void set_allow_nudging_weights(bool val) { allow_nudging_weights_ = val; } + + private: + bool allow_nudging_weights_ = false; +}; + #undef DECLARE_GRAPH_TRANSFORMATION } // end namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc new file mode 100644 index 0000000000000000000000000000000000000000..50b90e7c2bfddb0382a4d44ad6c90fc7f7701273 --- /dev/null +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc @@ -0,0 +1,86 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h" +#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h" +#include "tensorflow/contrib/lite/toco/model.h" +#include "tensorflow/contrib/lite/toco/tooling_util.h" +#include "tensorflow/core/platform/logging.h" + +namespace toco { + +// Propagates default min/max values to any operator input/output array that +// is missing them. +// +// When provided a set of min/max values for uint8 arrays this will rescale +// the values for other data types as required and preserving the floating point +// range within the new type. +bool PropagateDefaultMinMax::Run(Model* model, std::size_t op_index) { + const auto it = model->operators.begin() + op_index; + const auto* op = it->get(); + + bool did_change = false; + + for (const auto& input : op->inputs) { + auto& input_array = model->GetArray(input); + if (!input_array.minmax && !input_array.buffer) { + did_change |= SetArrayMinMax(input, &input_array); + } + } + + for (const auto& output : op->outputs) { + auto& output_array = model->GetArray(output); + if (!output_array.minmax && !output_array.buffer) { + did_change |= SetArrayMinMax(output, &output_array); + } + } + + return did_change; +} + +// Sets the min/max on the given array, adjusting the reference_minmax for the +// final data type of the array if it is already specified. +bool PropagateDefaultMinMax::SetArrayMinMax(const string& array_name, + Array* array) { + CHECK(!array->minmax); + + ArrayDataType quantized_data_type = + GetQuantizedDataType(*array, ArrayDataType::kUint8); + for (const auto& type_range : type_ranges_) { + if (type_range.first == quantized_data_type) { + array->GetOrCreateMinMax() = type_range.second; + break; + } + } + if (!array->minmax) { + AddMessageF( + "No defaults specified for quantized data type %s of array %s, " + "skipping", + ArrayDataTypeName(quantized_data_type), array_name); + return false; + } + + AddMessageF("Adding default minmax %g,%g to array %s when quantized as %s", + array->GetMinMax().min, array->GetMinMax().max, array_name, + ArrayDataTypeName(quantized_data_type)); + + return true; +} + +} // namespace toco diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc index 9191e696629f42faf89289f53f9dbc97f4527ce9..be6e0e07dd08abef856fb48892d1fa693dd926e6 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -168,7 +168,9 @@ void ProcessConvOperator(Model* model, ConvOperator* op) { return; } const auto& input_shape = input_array.shape(); - CHECK_EQ(input_shape.dimensions_count(), 4); + CHECK(input_shape.dimensions_count() == 4) + << "Conv ops require 4D inputs. Input array \"" << op->inputs[0] + << "\" is " << input_shape.dimensions_count() << "D."; const auto& weights_array = model->GetArray(op->inputs[1]); // Yield until weights dims have been resolved. @@ -249,12 +251,6 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) { << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape " << toco::ShapeToString(weights_shape) << "."; - CHECK(weights_shape.dims(0) == 1 && weights_shape.dims(3) == 1) - << "TransposeConv weights dimensions must begin and end with 1. Input " - "weights \"" - << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape " - << toco::ShapeToString(weights_shape) << "."; - // Compute padding const int kheight = weights_shape.dims(1); const int kwidth = weights_shape.dims(2); @@ -269,9 +265,7 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) { LOG(FATAL) << "TransposeConv only supports SAME or VALID padding"; } - // VALIDATE OUTPUT SHAPE - // Compute the output shape from the input and weights shapes to verify it - // agrees with the specified output shape. + // VALIDATE some dimensions and set the output shape. const auto& input_array = model->GetArray(op->inputs[TransposeConvOperator::DATA_INPUT]); if (!input_array.has_shape()) { @@ -283,31 +277,13 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) { << "TransposeConv input shape must have 4 dimensions. Input \"" << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape " << toco::ShapeToString(weights_shape) << "."; + CHECK_EQ(input_shape.dims(3), weights_shape.dims(0)) + << "Input shape depth and weight depth do not agree"; - // Compute output shape - const int input_width = input_shape.dims(2); - const int input_height = input_shape.dims(1); - int output_height = op->stride_height * (input_height - 1); - int output_width = op->stride_width * (input_width - 1); - if (op->padding.type == PaddingType::kValid) { - output_height += kheight; - output_width += kwidth; - } else if (op->padding.type == PaddingType::kSame) { - output_height += 1; - output_width += 1; - } - - CHECK(specified_output_shape_array.GetBuffer().data == - std::vector({input_shape.dims(0), output_height, output_width, - weights_shape.dims(3)})) - << "Specified output shape: " << ShapeToString(output_array.shape()) - << ", does not agree with shape computed from input data and weights: [" - << input_shape.dims(0) << ", " << output_height << ", " << output_width - << ", " << weights_shape.dims(3) << "]."; - - // SUCCESS: Set the op's output shape according to the specified output shape. - *(output_array.mutable_shape()->mutable_dims()) = + // Set the output shape according to the specified output shape. + std::vector const& specified_output_shape = specified_output_shape_array.GetBuffer().data; + *(output_array.mutable_shape()->mutable_dims()) = specified_output_shape; } void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) { @@ -1179,6 +1155,11 @@ void ProcessRankOperator(Model* model, RankOperator* op) { return; } + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes + return; + } + const auto& input_array = model->GetArray(op->inputs[0]); if (!input_array.has_shape()) { // Yield until input dims have been resolved. @@ -1200,6 +1181,11 @@ void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) { return; } + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes + return; + } + const auto& input_array = model->GetArray(op->inputs[0]); if (!input_array.has_shape()) { // Yield until input dims have been resolved. @@ -1230,10 +1216,6 @@ void ProcessStackOperator(Model* model, StackOperator* op) { } Shape shape = input_array.shape(); - if (shape.dimensions_count() == 0) { - // Convert 0D scalars to 1D scalars of shape {1}. - shape.mutable_dims()->push_back(1); - } if (!stacked_shape) { stacked_shape.reset(new Shape(shape)); } else { @@ -1253,6 +1235,83 @@ void ProcessStackOperator(Model* model, StackOperator* op) { output_array.copy_shape(*stacked_shape); } +// These StridedSlice utility functions are essentially a COPY of those in +// reference_ops.h. See comments there. + +// Use until std::clamp() is available from C++17. +int Clamp(const int v, const int lo, const int hi) { + if (hi < v) return hi; + if (v < lo) return lo; + return v; +} + +int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape, + int axis) { + // Begin with the specified index + int start = op.start_indices[axis]; + + // begin_mask override + if (op.begin_mask & 1 << axis) { + if (op.strides[axis] > 0) { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits::lowest(); + } else { + // Backward iteration - use the last element. + start = std::numeric_limits::max(); + } + } + + // Handle negative indices + int axis_size = input_shape.dims(axis); + if (start < 0) { + start += axis_size; + } + + // Clamping + start = Clamp(start, 0, axis_size - 1); + + return start; +} + +int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape, + int axis) { + // Begin with the specified index + int stop = op.stop_indices[axis]; + + // end_mask override + if (op.end_mask & (1 << axis)) { + if (op.strides[axis] > 0) { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits::max(); + } else { + // Backward iteration - use the first element. + stop = std::numeric_limits::lowest(); + } + } + + // Handle negative indices + int axis_size = input_shape.dims(axis); + if (stop < 0) { + stop += axis_size; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (op.strides[axis] > 0) { + // Forward iteration + stop = Clamp(stop, 0, axis_size); + } else { + // Backward iteration + stop = Clamp(stop, -1, axis_size - 1); + } + + return stop; +} + void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) { CHECK_GE(op->inputs.size(), 1); CHECK_EQ(op->outputs.size(), 1); @@ -1290,43 +1349,46 @@ void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) { return; } - int dim_count = input_array.shape().dimensions_count(); - CHECK(op->start_indices.size() == dim_count) - << ": Incorrect number of start indices supplied to StridedSlice op with " - "output \"" - << op->outputs[0] << "\". Op requires " << dim_count << " start indices"; - CHECK(op->stop_indices.size() == dim_count) - << ": Incorrect number of stop indices supplied to StridedSlice op with " - "output \"" - << op->outputs[0] << "\". Op requires " << dim_count << " stop indices"; - CHECK(op->strides.size() == dim_count) - << ": Incorrect number of strides supplied to StridedSlice op with " - " output \"" - << op->outputs[0] << "\". Op requires " << dim_count << " strides"; + int num_input_axes = input_array.shape().dimensions_count(); + CHECK_LE(op->start_indices.size(), num_input_axes) + << "StridedSlice op with output \"" << op->outputs[0] + << "\", requires no more than " << num_input_axes << " start indices"; + CHECK_LE(op->stop_indices.size(), num_input_axes) + << "StridedSlice op with output \"" << op->outputs[0] + << "\", requires no more than " << num_input_axes << " stop indices"; + CHECK_LE(op->strides.size(), num_input_axes) + << "StridedSlice op with output \"" << op->outputs[0] + << "\", requires no more than " << num_input_axes << " strides"; + for (int i = 0; i < op->strides.size(); i++) { + CHECK_NE(op->strides[i], 0) << "Strides must be non-zero. Axis " << i + << " has stride=" << op->strides[i] << "."; + } + + // The TensorFlow documentation is not explicit on how it handles fewer + // supplied indices than dimensions, but they are accepted. We emulate TF's + // behavior by fully iterating over each "forgotten" dimension. + op->PadIndices(num_input_axes); // Create output shape std::vector* dims = output_array.mutable_shape()->mutable_dims(); // Compute output shape - for (int i = 0; i < dim_count; ++i) { - const int mask = 1 << i; - int start = (op->begin_mask & mask) ? 0 : op->start_indices[i]; - if (start < 0) { - // handle negative indices - start += input_array.shape().dims(i); - } - int stop = (op->end_mask & mask) ? input_array.shape().dims(i) - : op->stop_indices[i]; - if (stop < 0) { - // handle negative indices - stop += input_array.shape().dims(i); - } - - int dim_size = ceil((stop - start) / static_cast(op->strides[i])); - dim_size = dim_size < 0 ? 0 : dim_size; - if (op->shrink_axis_mask & mask) { - CHECK_EQ(dim_size, 1) << "Output size for an axis must compute to 1 when " - "shrinking that axis"; + for (int axis = 0; axis < num_input_axes; ++axis) { + int start_index = StartForAxis(*op, input_array.shape(), axis); + int stop_index = StopForAxis(*op, input_array.shape(), axis); + int dim_size = + ceil(static_cast(stop_index - start_index) / op->strides[axis]); + + CHECK_GT(dim_size, 0) + << "Output size for an axis must be greater than 0. Axis " << axis + << " computes to size " << dim_size + << " for StridedSlice op with output \"" << op->outputs[0] << "\"."; + if (op->shrink_axis_mask & (1 << axis)) { + CHECK_EQ(dim_size, 1) + << "Output size for an axis must compute to 1 when shrinking an " + "axis. Axis " + << axis << " computes to size " << dim_size + << " for StridedSlice op with output \"" << op->outputs[0] << "\"."; } else { dims->push_back(dim_size); } @@ -1436,13 +1498,10 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) { return; } - // The current ArgMax implementation only supports 4-dimensional inputs with - // the last dimension as the axis to perform ArgMax for. const std::vector& input_dims = input_array.shape().dims(); - CHECK_EQ(input_dims.size(), 4); std::vector output_dims; - output_dims.reserve(input_dims.size() - 1); + output_dims.reserve(input_dims.size()); for (int i = 0; i < input_dims.size() - 1; ++i) { output_dims.push_back(input_dims[i]); } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc index 5e779f6765262326bd59db886c2feed603e0102e..6e78653fad238085da5ba66166884093ea9b0214 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc @@ -233,7 +233,12 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) { } // Check that input data types agree. - CHECK(input0_array.data_type == input1_array.data_type); + CHECK(input0_array.data_type == input1_array.data_type) + << "Dissimilar data types given to op outputting \"" + << binary_op->outputs[0] << "\". 0:\"" << binary_op->inputs[0] << "\"(" + << static_cast(input0_array.data_type) << ") 1:\"" + << binary_op->inputs[1] << "\"(" + << static_cast(input1_array.data_type) << ")."; // Do the actual constants propagation EvaluateBinaryOperatorOnConstantInputs(model, binary_op); diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc index a0cfc3d59763dc1211ed4d1ac114d371a4a7ee0b..8df3c2f1955c23b301ca10f613b57a1fbe138593 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc @@ -23,40 +23,88 @@ namespace toco { namespace { +// These StridedSlice utility functions are essentially a COPY of those in +// reference_ops.h. See comments there. + +// Use until std::clamp() is available from C++17. +int Clamp(const int v, const int lo, const int hi) { + if (hi < v) return hi; + if (v < lo) return lo; + return v; +} + int StartForAxis(StridedSliceOperator const& op, Shape const& input_shape, int axis) { - int start; + // Begin with the specified index + int start = op.start_indices[axis]; + + // begin_mask override if (op.begin_mask & 1 << axis) { - // If begin mask bit is set, use the first element - start = 0; - } else { - // Otherwise, use the specified element - start = op.start_indices[axis]; - if (start < 0) { - // Handle negative indices - start += input_shape.dims(axis); + if (op.strides[axis] > 0) { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axis_size-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits::lowest(); + } else { + // Backward iteration - use the last element. + start = std::numeric_limits::max(); } } + + // Handle negative indices + int axis_size = input_shape.dims(axis); + if (start < 0) { + start += axis_size; + } + + // Clamping + start = Clamp(start, 0, axis_size - 1); + return start; } int StopForAxis(StridedSliceOperator const& op, Shape const& input_shape, int axis) { - int stop; + // Begin with the specified index + int stop = op.stop_indices[axis]; + + // end_mask override if (op.end_mask & (1 << axis)) { - // If end mask bit set, use the last element - stop = input_shape.dims(axis); - } else { - // Otherwise, use the specified element - stop = op.stop_indices[axis]; - if (stop < 0) { - // Handle negative indices - stop += input_shape.dims(axis); + if (op.strides[axis] > 0) { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits::max(); + } else { + // Backward iteration - use the first element. + stop = std::numeric_limits::lowest(); } } + + // Handle negative indices + int axis_size = input_shape.dims(axis); + if (stop < 0) { + stop += axis_size; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (op.strides[axis] > 0) { + // Forward iteration + stop = Clamp(stop, 0, axis_size); + } else { + // Backward iteration + stop = Clamp(stop, -1, axis_size - 1); + } + return stop; } +bool LoopCondition(int index, int stop, int stride) { + // True when we have reached the end of an axis and should loop. + return stride > 0 ? index >= stop : index <= stop; +} + template void StridedSlice(StridedSliceOperator const& op, Array const& input_array, Array* output_array) { @@ -73,9 +121,6 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array, int num_input_axes = op.start_indices.size(); CHECK_EQ(num_input_axes, op.stop_indices.size()); CHECK_EQ(num_input_axes, op.strides.size()); - for (int i = 0; i < op.strides.size(); i++) { - CHECK_GE(op.strides[i], 0) << "Negative strides usupported"; - } // Create a buffer for the output array std::vector>& output_data = @@ -103,13 +148,15 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array, // Compute next source input coordinates. bool carry = true; for (int axis = 0; axis < num_input_axes; axis++) { + int stride = op.strides[axis]; // Increment this axis if we carried from the previous one if (carry) { - src_coord[axis] += op.strides[axis]; + src_coord[axis] += stride; } // Check if we've overflowed. - if (src_coord[axis] >= StopForAxis(op, input_shape, axis)) { + int stop = StopForAxis(op, input_shape, axis); + if (LoopCondition(src_coord[axis], stop, stride)) { // Reset axis and set carry src_coord[axis] = StartForAxis(op, input_shape, axis); carry = true; diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc index 37beb41dfc5904fc6ace79ebea2420d2ab92fbfb..4bb1217828a9c480241a3b503dffe26462df4063 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc @@ -60,6 +60,11 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) { const auto& output_array_name = mul_op->outputs[0]; auto& output_array = model->GetArray(output_array_name); + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes + return false; + } + // Yield if the output shape is not known yet. if (!output_array.has_shape()) { return false; diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 155d890c9f23ba206f1f0e6db645a601308cea5b..2ed05cb3720662013e4805204aca89bcbf19447a 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -1093,8 +1093,10 @@ void ConvertMatMulOperator(const NodeDef& node, // Transpose flags should be easy to support, but we don't have a // GraphDef with them to test on at the moment. - CHECK_EQ(GetBoolAttr(node, "transpose_a"), false); - CHECK_EQ(GetBoolAttr(node, "transpose_b"), false); + CHECK_EQ(HasAttr(node, "transpose_a") && GetBoolAttr(node, "transpose_a"), + false); + CHECK_EQ(HasAttr(node, "transpose_b") && GetBoolAttr(node, "transpose_b"), + false); CHECK(!HasAttr(node, "adjoint_a") || (GetBoolAttr(node, "adjoint_a") == false)); CHECK(!HasAttr(node, "adjoint_b") || @@ -1300,11 +1302,17 @@ void ConvertStridedSliceOperator(const NodeDef& node, } op->outputs.push_back(node.name()); - op->begin_mask = GetIntAttr(node, "begin_mask"); - op->ellipsis_mask = GetIntAttr(node, "ellipsis_mask"); - op->end_mask = GetIntAttr(node, "end_mask"); - op->new_axis_mask = GetIntAttr(node, "new_axis_mask"); - op->shrink_axis_mask = GetIntAttr(node, "shrink_axis_mask"); + op->begin_mask = + HasAttr(node, "begin_mask") ? GetIntAttr(node, "begin_mask") : 0; + op->ellipsis_mask = + HasAttr(node, "ellipsis_mask") ? GetIntAttr(node, "ellipsis_mask") : 0; + op->end_mask = HasAttr(node, "end_mask") ? GetIntAttr(node, "end_mask") : 0; + op->new_axis_mask = + HasAttr(node, "new_axis_mask") ? GetIntAttr(node, "new_axis_mask") : 0; + op->shrink_axis_mask = HasAttr(node, "shrink_axis_mask") + ? GetIntAttr(node, "shrink_axis_mask") + : 0; + model->operators.emplace_back(op); } @@ -1394,8 +1402,11 @@ void ConvertArgMaxOperator(const NodeDef& node, Model* model) { CHECK_EQ(node.op(), "ArgMax"); CheckInputsCount(node, tf_import_flags, 2); - const auto axis_data_type = GetDataTypeAttr(node, "Tidx"); - const auto output_type = GetDataTypeAttr(node, "output_type"); + const auto axis_data_type = + HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32; + const auto output_type = HasAttr(node, "output_type") + ? GetDataTypeAttr(node, "output_type") + : DT_INT64; CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32); CHECK(output_type == DT_INT64 || output_type == DT_INT32); auto* op = new ArgMaxOperator; @@ -1772,7 +1783,7 @@ void ConvertStackOperator(const NodeDef& node, op->inputs.push_back(node.input(i)); } // Both "Stack" and "Pack" have the "axis" attribute. - op->axis = GetIntAttr(node, "axis"); + op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0; op->outputs.push_back(node.name()); model->operators.emplace_back(op); } diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h index 787c20e574b2686329904a30958c1f8c4e5f0e96..482cc71d8b34705041130c7518b58f7fe183bc3c 100644 --- a/tensorflow/contrib/lite/toco/model.h +++ b/tensorflow/contrib/lite/toco/model.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/contrib/lite/toco/model_flags.pb.h" #include "tensorflow/contrib/lite/toco/runtime/types.h" +#include "tensorflow/contrib/lite/toco/toco_port.h" #include "tensorflow/contrib/lite/toco/toco_types.h" #include "tensorflow/core/platform/logging.h" @@ -845,6 +846,60 @@ struct StridedSliceOperator : Operator { int end_mask; int new_axis_mask; int shrink_axis_mask; + + StridedSliceOperator(const StridedSliceOperator& other) + : Operator(OperatorType::kStridedSlice) { + inputs = other.inputs; + outputs = other.outputs; + + start_indices = other.start_indices; + stop_indices = other.stop_indices; + strides = other.strides; + + begin_mask = other.begin_mask; + ellipsis_mask = other.ellipsis_mask; + end_mask = other.end_mask; + new_axis_mask = other.new_axis_mask; + shrink_axis_mask = other.shrink_axis_mask; + } + + void PadIndices(int dim_count) { + // Add indices and mask bits to fully include extra dimensions + CHECK_GE(dim_count, start_indices.size()); + CHECK_EQ(start_indices.size(), stop_indices.size()); + CHECK_EQ(stop_indices.size(), strides.size()); + + for (int i = start_indices.size(); i < dim_count; i++) { + start_indices.push_back(0); + stop_indices.push_back(0); + strides.push_back(1); + begin_mask |= 1 << i; + end_mask |= 1 << i; + } + } + + void ReverseIndices() { + CHECK_EQ(start_indices.size(), stop_indices.size()); + CHECK_EQ(stop_indices.size(), strides.size()); + + std::reverse(start_indices.begin(), start_indices.end()); + std::reverse(stop_indices.begin(), stop_indices.end()); + std::reverse(strides.begin(), strides.end()); + + begin_mask = toco::port::ReverseBits32(static_cast(begin_mask)) >> + (32 - start_indices.size()); + ellipsis_mask = + toco::port::ReverseBits32(static_cast(ellipsis_mask)) >> + (32 - start_indices.size()); + end_mask = toco::port::ReverseBits32(static_cast(end_mask)) >> + (32 - start_indices.size()); + new_axis_mask = + toco::port::ReverseBits32(static_cast(new_axis_mask)) >> + (32 - start_indices.size()); + shrink_axis_mask = + toco::port::ReverseBits32(static_cast(shrink_axis_mask)) >> + (32 - start_indices.size()); + } }; // Reshaping operator, reshaping its input array to a two-dimensional shape diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD index e0191801a0f0076565c51085ec293524d63cbe88..e1025c66642d2860c5916bf7625f1c0403c9901c 100644 --- a/tensorflow/contrib/lite/toco/tflite/BUILD +++ b/tensorflow/contrib/lite/toco/tflite/BUILD @@ -54,6 +54,7 @@ cc_library( "types.h", ], deps = [ + "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/schema:schema_fbs", "//tensorflow/contrib/lite/toco:model", ], diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc index 0afd2f3df57caf3214dd198bfa2ee75fa7a8fd7b..c9c2e9ba0184ef3f531f325091afaf6976e07f4f 100644 --- a/tensorflow/contrib/lite/toco/tflite/types.cc +++ b/tensorflow/contrib/lite/toco/tflite/types.cc @@ -13,12 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/toco/tflite/types.h" +#include "tensorflow/contrib/lite/string_util.h" namespace toco { namespace tflite { namespace { + +DataBuffer::FlatBufferOffset CopyStringToBuffer( + const Array& array, flatbuffers::FlatBufferBuilder* builder) { + const auto& src_data = array.GetBuffer().data; + ::tflite::DynamicBuffer dyn_buffer; + for (const string& str : src_data) { + dyn_buffer.AddString(str.c_str(), str.length()); + } + char* tensor_buffer; + int bytes = dyn_buffer.WriteToBuffer(&tensor_buffer); + std::vector dst_data(bytes); + memcpy(dst_data.data(), tensor_buffer, bytes); + free(tensor_buffer); + return builder->CreateVector(dst_data.data(), bytes); +} + template DataBuffer::FlatBufferOffset CopyBuffer( const Array& array, flatbuffers::FlatBufferBuilder* builder) { @@ -29,6 +46,18 @@ DataBuffer::FlatBufferOffset CopyBuffer( return builder->CreateVector(dst_data, size); } +void CopyStringFromBuffer(const ::tflite::Buffer& buffer, Array* array) { + auto* src_data = reinterpret_cast(buffer.data()->data()); + std::vector* dst_data = + &array->GetMutableBuffer().data; + int32_t num_strings = ::tflite::GetStringCount(src_data); + for (int i = 0; i < num_strings; i++) { + ::tflite::StringRef str_ref = ::tflite::GetString(src_data, i); + string this_str(str_ref.str, str_ref.len); + dst_data->push_back(this_str); + } +} + template void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) { using NativeT = ::toco::DataType; @@ -93,7 +122,7 @@ flatbuffers::Offset> DataBuffer::Serialize( case ArrayDataType::kInt64: return CopyBuffer(array, builder); case ArrayDataType::kString: - return CopyBuffer(array, builder); + return CopyStringToBuffer(array, builder); case ArrayDataType::kUint8: return CopyBuffer(array, builder); default: @@ -114,7 +143,7 @@ void DataBuffer::Deserialize(const ::tflite::Tensor& tensor, case ::tflite::TensorType_INT64: return CopyBuffer(buffer, array); case ::tflite::TensorType_STRING: - return CopyBuffer(buffer, array); + return CopyStringFromBuffer(buffer, array); case ::tflite::TensorType_UINT8: return CopyBuffer(buffer, array); default: diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc index a040fe135841b92a6e668f32cc5e36cf812ab15b..29fb0b2af22ef13e735f7c8128a2b9f6dbc42d19 100644 --- a/tensorflow/contrib/lite/toco/tflite/types_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc @@ -151,6 +151,13 @@ TEST(DataBuffer, Int32) { ::testing::ElementsAre(1, 1 << 30)); } +TEST(DataBuffer, String) { + Array recovered = ToFlatBufferAndBack( + {"AA", "BBB", "Best. String. Ever."}); + EXPECT_THAT(recovered.GetBuffer().data, + ::testing::ElementsAre("AA", "BBB", "Best. String. Ever.")); +} + TEST(Padding, All) { EXPECT_EQ(::tflite::Padding_SAME, Padding::Serialize(PaddingType::kSame)); EXPECT_EQ(PaddingType::kSame, Padding::Deserialize(::tflite::Padding_SAME)); diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc index d1d68b6b470253b0d756f2841ce0a62b509ca89a..1611c4d0c0b148e00dbf0b21b9cd65d4c8163c2c 100644 --- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc +++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc @@ -61,11 +61,21 @@ bool ParseTocoFlagsFromCommandLineFlags( Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(), parsed_flags.default_ranges_min.default_value(), "If defined, will be used as the default value for the min bound " - "of min/max ranges used for quantization."), + "of min/max ranges used for quantization of uint8 arrays."), Flag("default_ranges_max", parsed_flags.default_ranges_max.bind(), parsed_flags.default_ranges_max.default_value(), "If defined, will be used as the default value for the max bound " - "of min/max ranges used for quantization."), + "of min/max ranges used for quantization of uint8 arrays."), + Flag("default_int16_ranges_min", + parsed_flags.default_int16_ranges_min.bind(), + parsed_flags.default_int16_ranges_min.default_value(), + "If defined, will be used as the default value for the min bound " + "of min/max ranges used for quantization of int16 arrays."), + Flag("default_int16_ranges_max", + parsed_flags.default_int16_ranges_max.bind(), + parsed_flags.default_int16_ranges_max.default_value(), + "If defined, will be used as the default value for the max bound " + "of min/max ranges used for quantization of int16 arrays."), Flag("inference_type", parsed_flags.inference_type.bind(), parsed_flags.inference_type.default_value(), "Target data type of arrays in the output file (for input_arrays, " @@ -131,6 +141,13 @@ bool ParseTocoFlagsFromCommandLineFlags( parsed_flags.propagate_fake_quant_num_bits.default_value(), "If true, use FakeQuant* operator num_bits attributes to adjust " "array data_types."), + Flag("allow_nudging_weights_to_use_fast_gemm_kernel", + parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel.bind(), + parsed_flags.allow_nudging_weights_to_use_fast_gemm_kernel + .default_value(), + "Some fast uint8 GEMM kernels require uint8 weights to avoid the " + "value 0. This flag allows nudging them to 1 to allow proceeding, " + "with moderate inaccuracy."), }; bool asked_for_help = *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help")); @@ -212,12 +229,16 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags, PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone); READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone); READ_TOCO_FLAG(default_ranges_max, FlagRequirement::kNone); + READ_TOCO_FLAG(default_int16_ranges_min, FlagRequirement::kNone); + READ_TOCO_FLAG(default_int16_ranges_max, FlagRequirement::kNone); READ_TOCO_FLAG(drop_fake_quant, FlagRequirement::kNone); READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone); READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone); READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone); READ_TOCO_FLAG(debug_disable_recurrent_cell_fusion, FlagRequirement::kNone); READ_TOCO_FLAG(propagate_fake_quant_num_bits, FlagRequirement::kNone); + READ_TOCO_FLAG(allow_nudging_weights_to_use_fast_gemm_kernel, + FlagRequirement::kNone); // Deprecated flag handling. if (parsed_toco_flags.input_type.specified()) { diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto index 751aca948ca4149778c468f4be2a454ee71bafa7..a04017a6bf05fb5a1ae324051375340a0adaccc6 100644 --- a/tensorflow/contrib/lite/toco/toco_flags.proto +++ b/tensorflow/contrib/lite/toco/toco_flags.proto @@ -37,7 +37,7 @@ enum FileFormat { // of as properties of models, instead describing how models are to be // processed in the context of the present tooling job. // -// Next ID to use: 15. +// Next ID to use: 18. message TocoFlags { // Input file format optional FileFormat input_format = 1; @@ -103,8 +103,14 @@ message TocoFlags { // for experimentation purposes only and should not be used in production: // they make it easy to quantize models, but the resulting quantized model // will be inaccurate. + // + // These values only apply to arrays quantized with the kUint8 data type. optional float default_ranges_min = 5; optional float default_ranges_max = 6; + // Equivalent versions of default_ranges_min/_max for arrays quantized with + // the kInt16 data type. + optional float default_int16_ranges_min = 15; + optional float default_int16_ranges_max = 16; // Ignore and discard FakeQuant nodes. For instance, that can be used to // generate plain float code without fake-quantization from a quantized @@ -150,4 +156,9 @@ message TocoFlags { // Input and output array data types may change because of this propagation // and users must be sure to query the final data_type values. optional bool propagate_fake_quant_num_bits = 14; + + // Some fast uint8 GEMM kernels require uint8 weights to avoid the value 0. + // This flag allows nudging them to 1 to allow proceeding, with moderate + // inaccuracy. + optional bool allow_nudging_weights_to_use_fast_gemm_kernel = 17; } diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h index 4be3b5a0bf00ed204a1218545d9e66f7685a50d7..2d5c231bef350884cd2b0bb62cfdbddebfed7f58 100644 --- a/tensorflow/contrib/lite/toco/toco_port.h +++ b/tensorflow/contrib/lite/toco/toco_port.h @@ -75,6 +75,14 @@ Status Exists(const string& filename, const Options& options); void CopyToBuffer(const ::Cord& src, char* dest); #endif // PLATFORM_GOOGLE void CopyToBuffer(const string& src, char* dest); + +inline uint32 ReverseBits32(uint32 n) { + n = ((n >> 1) & 0x55555555) | ((n & 0x55555555) << 1); + n = ((n >> 2) & 0x33333333) | ((n & 0x33333333) << 2); + n = ((n >> 4) & 0x0F0F0F0F) | ((n & 0x0F0F0F0F) << 4); + return (((n & 0xFF) << 24) | ((n & 0xFF00) << 8) | ((n & 0xFF0000) >> 8) | + ((n & 0xFF000000) >> 24)); +} } // namespace port inline bool ParseFromStringOverload(const std::string& in, diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc index b69852453cc57ec42c6212cf8537d80640629736..7252ec2ea4d8864334fc989c91fc8c26efe2efea 100644 --- a/tensorflow/contrib/lite/toco/toco_tooling.cc +++ b/tensorflow/contrib/lite/toco/toco_tooling.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "absl/strings/str_join.h" #include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h" #include "tensorflow/contrib/lite/toco/dump_graphviz.h" @@ -270,10 +271,6 @@ void Transform(const TocoFlags& toco_flags, Model* model) { RunGraphTransformations(model, "general graph transformations", transformations); - // Fix any issues with IO edges. This must happen after any transform that - // may modify the structure of the edges. - FixEdgeArrays(model); - if (quantize_output) { if (toco_flags.propagate_fake_quant_num_bits()) { RunGraphTransformations(model, @@ -287,23 +284,50 @@ void Transform(const TocoFlags& toco_flags, Model* model) { }); } + // Fix any issues with IO edges. This must happen after any transform that + // may modify the structure of the edges. + FixEdgeArrays(model); + if (quantize_output) { + // If the user specified default min/max ranges we need to set all arrays + // that didn't either have a min/max specified or get one set via + // HardcodeMinMax or PropagateFakeQuantNumBits. This may require running + // HardcodeMinMax to move changes through the graph as we make changes. + auto propagate_default_min_max = + absl::make_unique(); if (toco_flags.has_default_ranges_min() && toco_flags.has_default_ranges_max()) { - UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(), - toco_flags.default_ranges_max()); - // The new MinMax info may need to be propagated a bit. + propagate_default_min_max->DefineTypeRange( + ArrayDataType::kUint8, toco_flags.default_ranges_min(), + toco_flags.default_ranges_max()); + } + if (toco_flags.has_default_int16_ranges_min() && + toco_flags.has_default_int16_ranges_max()) { + propagate_default_min_max->DefineTypeRange( + ArrayDataType::kInt16, toco_flags.default_int16_ranges_min(), + toco_flags.default_int16_ranges_max()); + } + if (propagate_default_min_max->has_any_ranges_defined()) { RunGraphTransformations( model, "default min-max range propagation graph transformations", - {new HardcodeMinMax}); + { + propagate_default_min_max.release(), + new HardcodeMinMax, + }); } + CheckIsReadyForQuantization(*model); + auto* ensure_safe_for_int8_kernels = + new EnsureUint8WeightsSafeForFastInt8Kernels; + ensure_safe_for_int8_kernels->set_allow_nudging_weights( + toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel()); RunGraphTransformations(model, "quantization graph transformations", { new RemoveTrivialQuantizedActivationFunc, new RemoveTrivialQuantizedMinMax, new Quantize, new RemoveFinalDequantizeOp, + ensure_safe_for_int8_kernels, }); } else { GraphTransformationsSet dequantization_transformations{new Dequantize}; diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc index ecac0c28a5836488aba8590af0d32141fbd94e60..5a341294db5e5ce4e1774f8b772df9e77f3b61e8 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.cc +++ b/tensorflow/contrib/lite/toco/tooling_util.cc @@ -1405,20 +1405,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) { } input_minmax.min = (qmin - mean_value) / std_value; input_minmax.max = (qmax - mean_value) / std_value; - if (input_array.minmax) { - if (input_array_proto.has_mean_value() || - input_array_proto.has_std_value()) { - const double width = input_minmax.max - input_minmax.min; - const double kMinMaxAllowedDiff = 1e-6 * width; - CHECK(std::abs(input_minmax.min - input_array.minmax->min) < - kMinMaxAllowedDiff && - std::abs(input_minmax.max - input_array.minmax->max) < - kMinMaxAllowedDiff) - << input_minmax.min << ", " << input_minmax.max - << " != " << input_array.minmax->min << ", " - << input_array.minmax->max; - } - } else { + if (!input_array.minmax) { input_array.GetOrCreateMinMax() = input_minmax; } } @@ -1474,28 +1461,6 @@ void CheckIsReadyForQuantization(const Model& model) { } } -void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min, - double default_ranges_max) { - for (const auto& op : model->operators) { - for (const auto& input : op->inputs) { - auto& input_array = model->GetArray(input); - if (!input_array.minmax && !input_array.buffer) { - auto& minmax = input_array.GetOrCreateMinMax(); - minmax.min = default_ranges_min; - minmax.max = default_ranges_max; - } - } - for (const auto& output : op->outputs) { - auto& output_array = model->GetArray(output); - if (!output_array.minmax && !output_array.buffer) { - auto& minmax = output_array.GetOrCreateMinMax(); - minmax.min = default_ranges_min; - minmax.max = default_ranges_max; - } - } - } -} - int ElementSize(ArrayDataType data_type) { switch (data_type) { case ArrayDataType::kBool: diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h index 4c705f4e5fe66db175a7b92e82b090e204ab3dc6..5cc15fa57b3ea4ecb74c3f81af14e4fea95f6232 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.h +++ b/tensorflow/contrib/lite/toco/tooling_util.h @@ -188,8 +188,6 @@ T ConvertOperator(Operator* o, OperatorType type) { } void CheckIsReadyForQuantization(const Model& model); -void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min, - double default_ranges_max); bool ReshapeIsEquivalentToTranspose(const Model& model, const TensorFlowReshapeOperator* op, diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD index f616207d462954341dd0c4b2722471b50c06c917..e3928a82a2d453fdd36cb861ce178a776574269c 100644 --- a/tensorflow/contrib/lookup/BUILD +++ b/tensorflow/contrib/lookup/BUILD @@ -28,7 +28,7 @@ py_library( tf_py_test( name = "lookup_ops_test", - size = "small", + size = "medium", srcs = ["lookup_ops_test.py"], additional_deps = [ ":lookup_py", diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py index f681b7b132750ef80aa56f25143418fbc4eaa1bb..5d4682ec9f4b8c5864383bd1d2f4c0b41a11baad 100644 --- a/tensorflow/contrib/lookup/lookup_ops_test.py +++ b/tensorflow/contrib/lookup/lookup_ops_test.py @@ -58,6 +58,12 @@ class HashTableOpTest(test.TestCase): result = output.eval() self.assertAllEqual([0, 1, -1], result) + exported_keys_tensor, exported_values_tensor = table.export() + + self.assertItemsEqual([b"brain", b"salad", b"surgery"], + exported_keys_tensor.eval()) + self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval()) + def testHashTableFindHighRank(self): with self.test_session(): default_val = -1 diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 05e8d9064bea748c935859f5f9b4c7e646f504cf..1a1ab54a53dd5866ca8357067846c002c5d5e9c1 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -89,6 +89,7 @@ HOST_INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(HOST_GENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include @@ -125,7 +126,9 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text # The list of dependencies is derived from the Bazel build file by running # the gen_file_lists.sh script on a system with a working Bazel setup. PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt) -PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) +PROTO_TEXT_PB_CC_LIST := \ + $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \ + $(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt) # Locations of the intermediate files proto_text generates. @@ -171,6 +174,7 @@ INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) @@ -326,6 +330,7 @@ $(MARCH_OPTION) \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) @@ -603,6 +608,7 @@ $(wildcard tensorflow/core/platform/*/*.cc) \ $(wildcard tensorflow/core/platform/*/*/*.cc) \ $(wildcard tensorflow/core/util/*.cc) \ $(wildcard tensorflow/core/util/*/*.cc) \ +$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \ tensorflow/core/util/version_info.cc # Remove duplicates (for version_info.cc) CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 4d3de36e2a4141bed33d60f9e2e3c520aec3de67..eff9081e35c285027c764c5bdbaf14f78bc5f512 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -34,7 +34,8 @@ GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.g NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" -FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" +FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" +DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" @@ -89,6 +90,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync" download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf" download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2" download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d" +download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion" download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl" download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive" diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py index 4fe4e8d044bd0b0987c0221ab225f449a71ccfc7..c35e60a5547c23e5f9c7b7fc2a0702d8a7decf30 100644 --- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py +++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py @@ -13,7 +13,10 @@ # limitations under the License. # ============================================================================== -"""Apply graph_transforms tool to MetaGraphDefs.""" +"""Apply graph_transforms tool to MetaGraphDefs. + +@@meta_graph_transform +""" from __future__ import absolute_import from __future__ import division @@ -30,7 +33,7 @@ from tensorflow.python.framework import importer as _importer from tensorflow.python.framework import ops as _ops from tensorflow.python.saved_model import constants as _saved_model_constants from tensorflow.python.training import saver as _saver_lib -from tensorflow.python.util import compat +from tensorflow.python.util import compat as _compat from tensorflow.tools import graph_transforms as _graph_transforms @@ -161,7 +164,7 @@ def _clean_save_and_restore(graph_def, op, removed_op_names): shapes = [] dtypes = [] for index, value in enumerate(name_op_value_tensor.string_val): - if not _is_removed(compat.as_str(value), removed_op_names): + if not _is_removed(_compat.as_str(value), removed_op_names): names.append(value) shapes.append(shape_op_value_tensor.string_val[index]) dtypes.append(op.attr['dtypes'].list.type[index]) @@ -651,7 +654,7 @@ def _is_removed_mentioned(s, removed_op_names): # /foo/bar. This regex ensures that we handle these two nodes # as separate entities. It matches on nodes having names in the form of # '/foo/bar_x' as well as nodes having names in the form of 'foo.' - s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', compat.as_str_any(s)) + s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', _compat.as_str_any(s)) for removed_op_name in removed_op_names: for s_name in s_names: if s_name.endswith(removed_op_name): @@ -737,9 +740,9 @@ def meta_graph_transform( for tag in tags: meta_graph_def.meta_info_def.tags.append(tag) - base_op_names = [compat.as_str(node.name) + base_op_names = [_compat.as_str(node.name) for node in base_meta_graph_def.graph_def.node] - retained_op_names = [compat.as_str(node.name) + retained_op_names = [_compat.as_str(node.name) for node in meta_graph_def.graph_def.node] removed_op_names = set(base_op_names) - set(retained_op_names) diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD index 5ca42f41c1c5055bf1917ad175b7b30666b18d4b..e050f3c8d4fc61adfdd3d15869e533ed2b51c4a8 100644 --- a/tensorflow/contrib/metrics/BUILD +++ b/tensorflow/contrib/metrics/BUILD @@ -77,7 +77,7 @@ py_test( py_test( name = "metric_ops_test", srcs = ["python/ops/metric_ops_test.py"], - shard_count = 3, + shard_count = 8, srcs_version = "PY2AND3", tags = ["noasan"], # times out b/63678675 deps = [ diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py index de02dc8f457364450929776035829d86035d706b..5effea3596bb83a08e0a8627e411684262aef5f7 100644 --- a/tensorflow/contrib/metrics/__init__.py +++ b/tensorflow/contrib/metrics/__init__.py @@ -71,6 +71,7 @@ See the @{$python/contrib.metrics} guide. @@count @@precision_recall_at_equal_thresholds @@recall_at_precision +@@precision_at_recall """ from __future__ import absolute_import @@ -87,6 +88,7 @@ from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics from tensorflow.contrib.metrics.python.ops.metric_ops import auc_with_confidence_intervals from tensorflow.contrib.metrics.python.ops.metric_ops import cohen_kappa from tensorflow.contrib.metrics.python.ops.metric_ops import count +from tensorflow.contrib.metrics.python.ops.metric_ops import precision_at_recall from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision from tensorflow.contrib.metrics.python.ops.metric_ops import sparse_recall_at_top_k diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 4ecf71ab89ec18e7e39ee6d642be5e4ca66e54ae..00a933e5e0c537033573b225d43581f74557b240 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -2588,6 +2588,121 @@ def recall_at_precision(labels, return recall, update_op +def precision_at_recall(labels, + predictions, + target_recall, + weights=None, + num_thresholds=200, + metrics_collections=None, + updates_collections=None, + name=None): + """Computes the precision at a given recall. + + This function creates variables to track the true positives, false positives, + true negatives, and false negatives at a set of thresholds. Among those + thresholds where recall is at least `target_recall`, precision is computed + at the threshold where recall is closest to `target_recall`. + + For estimation of the metric over a stream of data, the function creates an + `update_op` operation that updates these variables and returns the + precision at `target_recall`. `update_op` increments the counts of true + positives, false positives, true negatives, and false negatives with the + weight of each case found in the `predictions` and `labels`. + + If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. + + For additional information about precision and recall, see + http://en.wikipedia.org/wiki/Precision_and_recall + + Args: + labels: The ground truth values, a `Tensor` whose dimensions must match + `predictions`. Will be cast to `bool`. + predictions: A floating point `Tensor` of arbitrary shape and whose values + are in the range `[0, 1]`. + target_recall: A scalar value in range `[0, 1]`. + weights: Optional `Tensor` whose rank is either 0, or the same rank as + `labels`, and must be broadcastable to `labels` (i.e., all dimensions must + be either `1`, or the same as the corresponding `labels` dimension). + num_thresholds: The number of thresholds to use for matching the given + recall. + metrics_collections: An optional list of collections to which `precision` + should be added. + updates_collections: An optional list of collections to which `update_op` + should be added. + name: An optional variable_scope name. + + Returns: + precision: A scalar `Tensor` representing the precision at the given + `target_recall` value. + update_op: An operation that increments the variables for tracking the + true positives, false positives, true negatives, and false negatives and + whose value matches `precision`. + + Raises: + ValueError: If `predictions` and `labels` have mismatched shapes, if + `weights` is not `None` and its shape doesn't match `predictions`, or if + `target_recall` is not between 0 and 1, or if either `metrics_collections` + or `updates_collections` are not a list or tuple. + RuntimeError: If eager execution is enabled. + """ + if context.executing_eagerly(): + raise RuntimeError('tf.metrics.precision_at_recall is not ' + 'supported when eager execution is enabled.') + + if target_recall < 0 or target_recall > 1: + raise ValueError('`target_recall` must be in the range [0, 1].') + + with variable_scope.variable_scope(name, 'precision_at_recall', + (predictions, labels, weights)): + kepsilon = 1e-7 # Used to avoid division by zero. + thresholds = [ + (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2) + ] + thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon] + + values, update_ops = _streaming_confusion_matrix_at_thresholds( + predictions, labels, thresholds, weights) + + def compute_precision_at_recall(tp, fp, fn, name): + """Computes the precision at a given recall. + + Args: + tp: True positives. + fp: False positives. + fn: False negatives. + name: A name for the operation. + + Returns: + The precision at the desired recall. + """ + recalls = math_ops.div(tp, tp + fn + kepsilon) + + # Because recall is monotone decreasing as a function of the threshold, + # the smallest recall exceeding target_recall occurs at the largest + # threshold where recall >= target_recall. + admissible_recalls = math_ops.cast( + math_ops.greater_equal(recalls, target_recall), dtypes.int64) + tf_index = math_ops.reduce_sum(admissible_recalls) - 1 + + # Now we have the threshold at which to compute precision: + return math_ops.div(tp[tf_index] + kepsilon, + tp[tf_index] + fp[tf_index] + kepsilon, + name) + + precision_value = compute_precision_at_recall( + values['tp'], values['fp'], values['fn'], 'value') + update_op = compute_precision_at_recall( + update_ops['tp'], update_ops['fp'], update_ops['fn'], 'update_op') + + if metrics_collections: + ops.add_to_collections(metrics_collections, precision_value) + + if updates_collections: + ops.add_to_collections(updates_collections, update_op) + + return precision_value, update_op + + def streaming_sparse_average_precision_at_k(predictions, labels, k, diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index 33eb655fb660f0ecdfe1c5ab870d7f17690ae3ff..76420db8bda39435bcc2be2fd3d8c3467d6753e2 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -3380,6 +3380,138 @@ class RecallAtPrecisionTest(test.TestCase): self.assertAlmostEqual(target_recall, recall.eval()) +class PrecisionAtRecallTest(test.TestCase): + + def setUp(self): + np.random.seed(1) + ops.reset_default_graph() + + def testVars(self): + metrics.precision_at_recall( + predictions=array_ops.ones((10, 1)), + labels=array_ops.ones((10, 1)), + target_recall=0.7) + _assert_metric_variables(self, + ('precision_at_recall/true_positives:0', + 'precision_at_recall/false_negatives:0', + 'precision_at_recall/false_positives:0', + 'precision_at_recall/true_negatives:0')) + + def testMetricsCollection(self): + my_collection_name = '__metrics__' + mean, _ = metrics.precision_at_recall( + predictions=array_ops.ones((10, 1)), + labels=array_ops.ones((10, 1)), + target_recall=0.7, + metrics_collections=[my_collection_name]) + self.assertListEqual(ops.get_collection(my_collection_name), [mean]) + + def testUpdatesCollection(self): + my_collection_name = '__updates__' + _, update_op = metrics.precision_at_recall( + predictions=array_ops.ones((10, 1)), + labels=array_ops.ones((10, 1)), + target_recall=0.7, + updates_collections=[my_collection_name]) + self.assertListEqual(ops.get_collection(my_collection_name), [update_op]) + + def testValueTensorIsIdempotent(self): + predictions = random_ops.random_uniform( + (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1) + labels = random_ops.random_uniform( + (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=1) + precision, update_op = metrics.precision_at_recall( + labels, predictions, target_recall=0.7) + + with self.test_session() as sess: + sess.run(variables.local_variables_initializer()) + + # Run several updates. + for _ in range(10): + sess.run(update_op) + + # Then verify idempotency. + initial_precision = precision.eval() + for _ in range(10): + self.assertAlmostEqual(initial_precision, precision.eval(), places=5) + + def testAllCorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32) + labels = constant_op.constant(inputs) + precision, update_op = metrics.precision_at_recall( + labels, predictions, target_recall=0.7) + + with self.test_session() as sess: + sess.run(variables.local_variables_initializer()) + self.assertEqual(1, sess.run(update_op)) + self.assertEqual(1, precision.eval()) + + def testAllIncorrect(self): + inputs = np.random.randint(0, 2, size=(100, 1)) + + predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32) + labels = 1.0 - predictions + label_prior = math_ops.reduce_mean(labels) + precision, update_op = metrics.precision_at_recall( + labels, predictions, target_recall=0.2) + + with self.test_session() as sess: + sess.run(variables.local_variables_initializer()) + self.assertEqual(sess.run(label_prior), sess.run(update_op)) + self.assertEqual(sess.run(label_prior), precision.eval()) + + def testSomeCorrectHighRecall(self): + predictions_values = [0.1, 0.2, 0.5, 0.3, 0.0, 0.1, 0.45, 0.5, 0.8, 0.9] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = constant_op.constant( + predictions_values, dtype=dtypes_lib.float32) + labels = constant_op.constant(labels_values) + precision, update_op = metrics.precision_at_recall( + labels, predictions, target_recall=0.8) + + with self.test_session() as sess: + sess.run(variables.local_variables_initializer()) + self.assertAlmostEqual(0.8, sess.run(update_op)) + self.assertAlmostEqual(0.8, precision.eval()) + + def testSomeCorrectLowRecall(self): + predictions_values = [0.1, 0.2, 0.7, 0.3, 0.0, 0.1, 0.45, 0.5, 0.6, 0.9] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + + predictions = constant_op.constant( + predictions_values, dtype=dtypes_lib.float32) + labels = constant_op.constant(labels_values) + precision, update_op = metrics.precision_at_recall( + labels, predictions, target_recall=0.4) + + with self.test_session() as sess: + sess.run(variables.local_variables_initializer()) + self.assertAlmostEqual(2.0/3, sess.run(update_op)) + self.assertAlmostEqual(2.0/3, precision.eval()) + + def testWeighted_multipleLabelDtypes(self): + for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32): + predictions_values = [ + 0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.22, 0.25, 0.31, 0.35] + labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + weights_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + predictions = constant_op.constant( + predictions_values, dtype=dtypes_lib.float32) + labels = math_ops.cast(labels_values, dtype=label_dtype) + weights = constant_op.constant(weights_values) + precision, update_op = metrics.precision_at_recall( + labels, predictions, target_recall=0.8, weights=weights) + + with self.test_session() as sess: + sess.run(variables.local_variables_initializer()) + self.assertAlmostEqual(34.0/43, sess.run(update_op)) + self.assertAlmostEqual(34.0/43, precision.eval()) + + class StreamingFNRThresholdsTest(test.TestCase): def setUp(self): diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc index 8dca90a1e34d6a234c2b1479ca5594e88afcc194..ed22ee667f1d73b3f86f77e09bad9bfec7e46391 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc +++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc @@ -73,7 +73,7 @@ limitations under the License. */ template -using StatusOr = perftools::gputools::port::StatusOr; +using StatusOr = se::port::StatusOr; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..475297ca92111c6ead01b41d402556094dab1ee0 --- /dev/null +++ b/tensorflow/contrib/mpi_collectives/mpi_ops.cc @@ -0,0 +1,1236 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef TENSORFLOW_USE_MPI + +#include +#include +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/mutex.h" + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#include +#include "tensorflow/stream_executor/stream.h" +#endif + +#include "tensorflow/stream_executor/lib/statusor.h" + +#define OMPI_SKIP_MPICXX +#include "third_party/mpi/mpi.h" +#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" +#include "tensorflow/contrib/mpi_collectives/ring.h" + +/* + * MPI Allreduce and Allgather Ops for TensorFlow. + * + * TensorFlow natively provides inter-device communication through send and + * receive ops and inter-node communication through Distributed TensorFlow, + * based on the same send and receive abstractions. These end up being + * insufficient for synchronous data-parallel training on HPC clusters where + * Infiniband or other high-speed interconnects are available. This module + * implements MPI ops for allgather and allreduce, which do bandwidth-optimal + * gathers and reductions and can take advantage of hardware-optimized + * communication libraries through the MPI implementation. + * + * The primary logic of the allreduce and allgather are in RingAllgather() and + * RingAllreduce(). The background thread which facilitates MPI operations is + * run in BackgroundThreadLoop(). The provided MPI ops are: + * – MPIInit: + * Initialize MPI on a given device (CPU or GPU). + * Should only be run on a single device in every process. + * – MPISize: + * Get the number of MPI processes in the global communicator. + * – MPIRank: + * Get the rank of the current MPI process in the global communicator. + * – MPILocalRank: + * Get the local rank of the current MPI process within its node. + * – MPIAllreduce: + * Perform an allreduce on a Tensor, returning the sum + * across all MPI processes in the global communicator. + * – MPIAllgather: + * Perform an allgather on a Tensor, returning the concatenation of + * the tensor on the first dimension across all MPI processes in the + * global communicator. + * + */ + +template +using StatusOr = se::port::StatusOr; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +namespace tensorflow { +namespace contrib { +namespace mpi { + +// Make sure template specializations are generated in the ring.cu.cc and the +// ring.cc file, not in this file. +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, + Tensor*, Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllgather(OpKernelContext*, + const Tensor*, + const std::vector&, + Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, + Tensor*, Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllgather(OpKernelContext*, + const Tensor*, + const std::vector&, + Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); + +namespace { + +// Return true if the templated type is GPUDevice, otherwise false. +template +bool IsGPUDevice(); +template <> +bool IsGPUDevice() { + return true; +}; +template <> +bool IsGPUDevice() { + return false; +}; + +// A callback to call after the MPI communication completes. Since the +// allreduce and allgather ops are asynchronous, this callback is what resumes +// computation after the reduction is completed. +typedef std::function)> CommunicationDoneCallback; + +struct CollectiveOpRecord { + // The rank performing this piece of the op + int rank; + + // The name of the op/tensor to be reduced + std::string name; + + // The op's kernel context + OpKernelContext* context; + + // Data type of the op + DataType dtype; + + // The input tensor + const Tensor* in_t; + + // Allgather: Vector of per-rank first-dimension sizes + std::vector sizes_vec; + + // The temp tensor for intermediate results + Tensor temp_t; + + // The output tensor + Tensor* out_t; + + // Whether to run this op on the gpu + bool on_gpu; + + // The callback to call after the op has completed + CommunicationDoneCallback callback; +}; + +// Table storing Tensors to be reduced, keyed by unique name. +// This table contains everything necessary to do the reduction +typedef std::unordered_map TensorTable; + +// Table for storing Tensor metadata on rank zero. This is used for error +// checking and size calculations, as well as determining when a reduction is +// ready to be done (when all nodes are ready to do it). +typedef std::unordered_map > MessageTable; + +// The global state required for the MPI ops. +// +// MPI is a library that stores a lot of global per-program state and often +// requires running on a single thread. As a result, we have to have a single +// background thread responsible for all MPI operations, and communicate with +// that background thread through global state. +struct MPIGlobalState { + // An atomic boolean which is set to true when MPI is initialized. + // This ensures that MPI_Init is never called twice. + std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT; + + // Condition variable to wait for initialization + condition_variable cv; + + // Whether MPI_Init has been completed on the background thread. + bool initialization_done = false; + + // Whether MPI_Init succeeded on the background thread. + Status init_status; + + // A mutex that needs to be used whenever MPI operations touch + // shared structures. + mutex mu; + + // Tensors waiting to be allreduced or allgathered. + TensorTable tensor_table; + + // Queue of MPI requests waiting to be sent to the coordinator node. + std::queue message_queue; + + // Background thread running MPI communication. + std::thread background_thread; + + // Whether the background thread should shutdown. + bool shut_down = false; + + // Only exists on the coordinator node (rank zero). Maintains a count of + // how many nodes are ready to allreduce every tensor (keyed by tensor + // name). + std::unique_ptr message_table; + + // The MPI rank, local rank, and size. + int rank = 0; + int local_rank = 0; + int size = 1; + + // The device that MPI was initialized on. (-1 for no GPU) + int device = -1; + + // The CUDA stream used for data transfers and within-allreduce operations. + // A naive implementation would use the TensorFlow StreamExecutor CUDA + // stream. However, the allreduce and allgather require doing memory copies + // and kernel executions (for accumulation of values on the GPU). However, + // the subsequent operations must wait for those operations to complete, + // otherwise MPI (which uses its own stream internally) will begin the data + // transfers before the CUDA calls are complete. In order to wait for those + // CUDA operations, if we were using the TensorFlow stream, we would have + // to synchronize that stream; however, other TensorFlow threads may be + // submitting more work to that stream, so synchronizing on it can cause + // the allreduce to be delayed, waiting for compute totally unrelated to it + // in other parts of the graph. Overlaying memory transfers and compute + // during backpropagation is crucial for good performance, so we cannot use + // the TensorFlow stream, and must use our own stream. +#if GOOGLE_CUDA + cudaStream_t stream; + std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT; +#endif + + ~MPIGlobalState() { + // Make sure that the destructor of the background thread is safe to + // call. If a thread is still joinable (not detached or complete) its + // destructor cannot be called. + if (background_thread.joinable()) { + shut_down = true; + background_thread.join(); + } + } +}; + +// All the MPI state that must be stored globally per-process. +static MPIGlobalState mpi_global; + +// For clarify in argument lists. +#define RANK_ZERO 0 + +// A tag used for all coordinator messaging. +#define TAG_NOTIFY 1 + +// Store the MPIRequest for a name, and return whether the total count of +// MPIRequests for that tensor is now equal to the MPI size (and thus we are +// ready to reduce the tensor). +bool IncrementTensorCount(std::unique_ptr& message_table, + MPIRequest msg, int mpi_size) { + auto name = msg.tensor_name(); + auto table_iter = message_table->find(name); + if (table_iter == message_table->end()) { + message_table->emplace(name, std::vector({msg})); + table_iter = message_table->find(name); + } else { + table_iter->second.push_back(msg); + } + + int count = table_iter->second.size(); + return count == mpi_size; +} + +// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse +// instructing all ranks to start the reduction to all ranks. The MPIResponse +// also contains error messages in case the submitted MPIRequests were not +// valid (for example, contained mismatched shapes or types). +// +// Constructing the MPIResponse, thus, requires a whole lot of error checking. +MPIResponse ConstructMPIResponse(std::unique_ptr& message_table, + std::string name) { + bool error = false; + auto it = message_table->find(name); + assert(it != message_table->end()); + + std::vector requests = it->second; + assert(requests.size() > 0); + + std::ostringstream error_message_stream; + + // Check that all data types being reduced or gathered are identical + auto data_type = requests[0].tensor_type(); + for (unsigned int i = 1; i < requests.size(); i++) { + auto request_type = requests[i].tensor_type(); + if (data_type != request_type) { + error = true; + error_message_stream << "Mismatched data types: One rank had type " + << DataType_Name(data_type) + << ", but another rank had type " + << DataType_Name(request_type) << "."; + break; + } + } + + // Check that all requested operations are the same + auto message_type = requests[0].request_type(); + for (unsigned int i = 1; i < requests.size(); i++) { + if (error) { + break; + } + + auto request_type = requests[i].request_type(); + if (message_type != request_type) { + error = true; + error_message_stream << "Mismatched MPI operations: One rank did an " + << message_type << ", but another rank did an " + << request_type << "."; + break; + } + } + + // If we are doing an allreduce, check that all tensor shapes + // are identical + if (message_type == MPIRequest::ALLREDUCE) { + TensorShape tensor_shape = requests[0].tensor_shape(); + for (unsigned int i = 1; i < requests.size(); i++) { + if (error) { + break; + } + + TensorShape request_shape = requests[i].tensor_shape(); + if (tensor_shape != request_shape) { + error = true; + error_message_stream << "Mismatched allreduce tensor shapes: " + << "One rank reduced a tensor of shape " + << tensor_shape.DebugString() + << ", but another rank sent a tensor of shape " + << request_shape.DebugString() << "."; + break; + } + } + } + + // If we are doing an allgather, make sure all but the first dimension are + // the same. The first dimension may be different and the output tensor is + // the sum of the first dimension. Collect the sizes by rank. + if (message_type == MPIRequest::ALLGATHER) { + TensorShape tensor_shape = requests[0].tensor_shape(); + + if (tensor_shape.dims() == 0) { + error = true; + error_message_stream << "Rank zero tried to gather a rank-zero tensor."; + } + + for (unsigned int i = 1; i < requests.size(); i++) { + if (error) { + break; + } + + TensorShape request_shape = requests[i].tensor_shape(); + if (tensor_shape.dims() != request_shape.dims()) { + error = true; + error_message_stream << "Mismatched allgather tensor shapes: " + << "One rank gathered a tensor of rank " + << tensor_shape.dims() + << ", but another rank sent a tensor of rank " + << request_shape.dims() << "."; + break; + } + + for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) { + if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) { + error = true; + error_message_stream + << "Mismatched allgather tensor shapes: " + << "One rank gathered a tensor with dimension " << dim + << " equal to " << tensor_shape.dim_size(dim) + << ", but another rank sent a tensor with dimension " << dim + << " equal to " << request_shape.dim_size(dim) << "."; + break; + } + } + } + } + + MPIResponse response; + response.set_tensor_name(name); + if (error) { + std::string error_message = error_message_stream.str(); + response.set_response_type(MPIResponse::ERROR); + response.set_error_message(error_message); + } else { + auto response_type = MPIResponse::ERROR; + if (message_type == MPIRequest::ALLREDUCE) { + response_type = MPIResponse::ALLREDUCE; + } else { + response_type = MPIResponse::ALLGATHER; + } + response.set_response_type(response_type); + } + + // Clear all queued up requests for this name. They are now taken care of + // by the constructed MPI response. + message_table->erase(it); + + return response; +} + +// Process an MPIResponse by doing a reduction, a gather, or raising an error. +void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) { + OpKernelContext* context; + const Tensor* input_tensor; + std::vector sizes_vec; + Tensor temp_tensor; + Tensor* output_tensor; + CommunicationDoneCallback callback; + bool on_gpu; + { + // Lock on the tensor table. + mutex_lock guard(mpi_global.mu); + + // We should never fail at finding this key in the tensor table. + auto name = response.tensor_name(); + auto iter = tensor_table.find(name); + assert(iter != tensor_table.end()); + + assert(response.response_type() == MPIResponse::ALLREDUCE || + response.response_type() == MPIResponse::ALLGATHER || + response.response_type() == MPIResponse::ERROR); + + CollectiveOpRecord record = iter->second; + context = record.context; + input_tensor = record.in_t; + sizes_vec = record.sizes_vec; + temp_tensor = record.temp_t; + output_tensor = record.out_t; + on_gpu = record.on_gpu; + callback = record.callback; + + // Clear the tensor table of this tensor and its callbacks; the rest of + // this function takes care of it. + tensor_table.erase(iter); + } + + // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't + // link to non-existent symbols. +#if GOOGLE_CUDA +#define GPU_DEVICE_IF_CUDA GPUDevice +#else +#define GPU_DEVICE_IF_CUDA CPUDevice +#endif + + Status status; + auto dtype = input_tensor->dtype(); + if (response.response_type() == MPIResponse::ALLGATHER) { + if (dtype == DT_FLOAT) { + status = on_gpu ? RingAllgather( + context, input_tensor, sizes_vec, output_tensor) + : RingAllgather( + context, input_tensor, sizes_vec, output_tensor); + } else if (dtype == DT_INT32) { + status = on_gpu ? RingAllgather( + context, input_tensor, sizes_vec, output_tensor) + : RingAllgather(context, input_tensor, + sizes_vec, output_tensor); + } else if (dtype == DT_INT64) { + status = on_gpu ? RingAllgather( + context, input_tensor, sizes_vec, output_tensor) + : RingAllgather( + context, input_tensor, sizes_vec, output_tensor); + } else { + status = errors::Unknown("Invalid tensor type for MPI allgather."); + } + } else if (response.response_type() == MPIResponse::ALLREDUCE) { + if (dtype == DT_FLOAT) { + status = on_gpu ? RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor) + : RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor); + } else if (dtype == DT_INT32) { + status = on_gpu ? RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor) + : RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor); + } else if (dtype == DT_INT64) { + status = on_gpu ? RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor) + : RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor); + } else { + status = errors::Unknown("Invalid tensor type for MPI allreduce."); + } + } else if (response.response_type() == MPIResponse::ERROR) { + status = errors::FailedPrecondition(response.error_message()); + } + + if (status.ok()) { + callback(StatusOr(*output_tensor)); + } else { + callback(StatusOr(status)); + } +} + +// The MPI background thread loop coordinates all the MPI processes and the +// tensor reductions. The design of the communicator mechanism is limited by a +// few considerations: +// +// 1. Some MPI implementations require all MPI calls to happen from a +// single thread. Since TensorFlow may use several threads for graph +// processing, this means we must have our own dedicated thread for +// dealing with MPI. +// 2. We want to gracefully handle errors, when MPI processes do not +// properly agree upon what should happen (such as mismatched types or +// shapes). To do so requires the MPI processes to know about the shapes +// and types of the relevant tensors on the other processes. +// 3. The MPI reductions and gathers should be able to happen in parallel +// with other ongoing operations. Since MPI uses an internal +// (inaccessible) GPU stream separate from the TF GPUDevice streams, we +// cannot explicitly synchronize memcpys or kernels with it. As a result, +// MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper +// ordering of memcpys and kernels with respect to TF streams. +// 4. NOTE: We cannot guarantee that all the MPI processes reduce their +// tensors in the same order. Thus, there must be a way to ensure the +// reduction memcpys and kernels occur for correct tensors across all +// ranks at the same time. We choose to use a coordinator (rank ID 0) to +// gather and trigger the reduction operations that are ready to execute. +// +// The coordinator currently follows a master-worker paradigm. Rank zero acts +// as the master (the "coordinator"), whereas all other ranks are simply +// workers. Each rank runs its own background thread which progresses in ticks. +// In each tick, the following actions happen: +// +// a) The workers send any available MPIRequests to the coordinator. These +// MPIRequests indicate what the worker would like to do (i.e. which +// tensor they would like to gather or reduce, as well as their shape and +// type). They repeat this for every tensor that they would like to +// operate on after that tensor's collective op has executed ComputeAsync. +// +// b) The workers send an empty "DONE" message to the coordinator to +// indicate that there are no more tensors they wish to operate on. +// +// c) The coordinator receives the MPIRequests from the workers, as well +// as from its own TensorFlow ops, and stores them in a request table. The +// coordinator continues to receive MPIRequest messages until it has +// received MPI_SIZE number of empty "DONE" messages. +// +// d) The coordinator finds all tensors that are ready to be reduced, +// gathered, or all operations that result in an error. For each of those, +// it sends an MPIResponse to all the workers. When no more MPIResponses +// are available, it sends a "DONE" response to the workers. If the +// process is being shutdown, it instead sends a "SHUTDOWN" response. +// +// e) The workers listen for MPIResponse messages, processing each one by +// doing the required reduce or gather, until they receive a "DONE" +// response from the coordinator. At that point, the tick ends. +// If instead of "DONE" they receive "SHUTDOWN", they exit their +// background loop. +// TODO: Use the global mpi_global state variable instead of a local one +void BackgroundThreadLoop() { +#if GOOGLE_CUDA + // Set the device, so that this thread uses the same GPU context as the + // calling thread. + // TODO: Ensure that this is operating correctly. The background thread + // needs to be able to control all GPUs that the rank has access to, and + // might be more than 1 GPU. Tensors could be resident in any of the + // GPUs, so the background thread's accumulate and copy kernels might need + // to correctly set the device and it might be necessary for the background + // thread to manage multiple streams. + cudaSetDevice(mpi_global.device); + cudaStreamCreate(&mpi_global.stream); +#endif + + // Initialize MPI. This must happen on the background thread, since not all + // MPI implementations support being called from multiple threads. + auto init_result = MPI_Init(NULL, NULL); + if (init_result != MPI_SUCCESS) { + mpi_global.init_status = + errors::Unknown("Could not initialize MPI; MPI_Init() failed."); + mpi_global.initialization_done = true; + mpi_global.cv.notify_all(); + return; + } else { + mpi_global.init_status = Status::OK(); + } + + // Get MPI rank to determine if we are rank zero. + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + bool is_coordinator = rank == 0; + + // Get MPI size to determine how many tensors to wait for before reducing. + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // Determine local rank by querying the local communicator. + MPI_Comm local_comm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, + &local_comm); + int local_rank; + MPI_Comm_rank(local_comm, &local_rank); + + mpi_global.rank = rank; + mpi_global.local_rank = local_rank; + mpi_global.size = size; + mpi_global.initialization_done = true; + + // Notify calling thread that initialization is complete + mpi_global.cv.notify_all(); + + // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD! + // Initialize the tensor count table. No tensors are available yet. + if (is_coordinator) { + mpi_global.message_table = + std::unique_ptr(new MessageTable()); + } + + // The coordinator sends a SHUTDOWN message to trigger shutdown. + bool should_shut_down = false; + do { + // TODO: Eliminate the need for thread sleep by making all activity + // depend on other activity (e.g. condition or MPI waits). + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + + // Copy the data structures from global state under this lock. + // However, don't keep the lock for the rest of the loop, so that + // enqueued stream callbacks can continue. + std::queue message_queue; + { + mutex_lock guard(mpi_global.mu); + while (!mpi_global.message_queue.empty()) { + MPIRequest message = mpi_global.message_queue.front(); + mpi_global.message_queue.pop(); + message_queue.push(message); + } + } + + // Collect all tensors that are ready to be reduced. Record them in the + // tensor count table (rank zero) or send them to rank zero to be + // recorded (everyone else). + std::vector ready_to_reduce; + while (!message_queue.empty()) { + // Pop the first available message message + MPIRequest message = message_queue.front(); + message_queue.pop(); + + if (is_coordinator) { + bool reduce = + IncrementTensorCount(mpi_global.message_table, message, size); + if (reduce) { + ready_to_reduce.push_back(message.tensor_name()); + } + } else { + std::string encoded_message; + message.SerializeToString(&encoded_message); + MPI_Send(encoded_message.c_str(), encoded_message.length() + 1, + MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); + } + } + + // Rank zero has put all its own tensors in the tensor count table. + // Now, it should count all the tensors that are coming from other + // ranks at this tick. It should keep getting tensors until it gets a + // DONE message from all the other ranks. + if (is_coordinator) { + // Count of DONE messages. Keep receiving messages until the number + // of messages is equal to the number of processes. Initialize to + // one since the coordinator is effectively done. + int completed_ranks = 1; + while (completed_ranks != size) { + MPI_Status status; + MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status); + + // Find number of characters in message (including zero byte). + int source_rank = status.MPI_SOURCE; + int msg_length; + MPI_Get_count(&status, MPI_BYTE, &msg_length); + + // If the length is zero, this is a DONE message. + if (msg_length == 0) { + completed_ranks++; + MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD, + &status); + continue; + } + + // Get tensor name from MPI into an std::string. + char* buffer = new char[msg_length]; + MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY, + MPI_COMM_WORLD, &status); + std::string received_data(buffer); + delete[] buffer; + + MPIRequest received_message; + received_message.ParseFromString(received_data); + auto received_name = received_message.tensor_name(); + + bool reduce = IncrementTensorCount(mpi_global.message_table, + received_message, size); + if (reduce) { + ready_to_reduce.push_back(received_name); + } + } + + // At this point, rank zero should have a fully updated tensor + // count table and should know all the tensors that need to be + // reduced or gathered, and everyone else should have sent all + // their information to rank zero. We can now do reductions and + // gathers; rank zero will choose which ones and in what order, + // and will notify the other ranks before doing each reduction. + for (int i = 0; i < ready_to_reduce.size(); i++) { + // Notify all nodes which tensor we'd like to reduce now + auto name = ready_to_reduce[i]; + MPIResponse response = + ConstructMPIResponse(mpi_global.message_table, name); + + std::string encoded_response; + response.SerializeToString(&encoded_response); + for (int r = 1; r < size; r++) { + MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, + MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); + } + + // Perform the reduction. All nodes should end up performing + // the same reduction. + PerformCollectiveOp(mpi_global.tensor_table, response); + } + + // Notify all nodes that we are done with the reductions for this + // tick. + MPIResponse done_response; + should_shut_down = mpi_global.shut_down; + done_response.set_response_type( + mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE); + std::string encoded_response; + done_response.SerializeToString(&encoded_response); + for (int r = 1; r < size; r++) { + MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, + MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); + } + } else { + // Notify the coordinator that this node is done sending messages. + // A DONE message is encoded as a zero-length message. + MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); + + // Receive names for tensors to reduce from rank zero. Once we + // receive a empty DONE message, stop waiting for more names. + while (true) { + MPI_Status status; + MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status); + + // Find number of characters in message (including zero byte). + int msg_length; + MPI_Get_count(&status, MPI_BYTE, &msg_length); + + // Get tensor name from MPI into an std::string. + char* buffer = new char[msg_length]; + MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD, + &status); + std::string received_message(buffer); + delete[] buffer; + + MPIResponse response; + response.ParseFromString(received_message); + if (response.response_type() == MPIResponse::DONE) { + // No more messages this tick + break; + } else if (response.response_type() == MPIResponse::SHUTDOWN) { + // No more messages this tick, and the background thread + // should shut down + should_shut_down = true; + break; + } else { + // Process the current message + PerformCollectiveOp(mpi_global.tensor_table, response); + } + } + } + } while (!should_shut_down); + + MPI_Finalize(); +} + +// Initialize MPI and start the MPI background thread. Ensure that this is +// only done once no matter how many times this function is called. +Status InitializeMPIOnce(bool gpu) { + // Ensure MPI is only initialized once. + if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status; + + mpi_global.device = -1; +#if GOOGLE_CUDA + if (gpu) { + cudaGetDevice(&mpi_global.device); + } +#endif + + // Start the MPI background thread, which assumes MPI is initialized + // TODO: Change this to a Tensorflow thread + mpi_global.background_thread = std::thread(BackgroundThreadLoop); + + // Wait to ensure that the background thread has finished initializing MPI + mutex_lock guard(mpi_global.mu); + mpi_global.cv.wait(guard); + if (!mpi_global.initialization_done) { + mpi_global.init_status = + errors::Unknown("Failed to wait for MPI initialization."); + } + + return mpi_global.init_status; +} + +// Check that MPI is initialized. +Status IsMPIInitialized() { + if (!mpi_global.initialization_done) { + return errors::FailedPrecondition( + "MPI has not been initialized; use tf.contrib.mpi.Session."); + } + return Status::OK(); +} + +// This function (called from the callback set up in MPIAll*Op::ComputeAsync) +// only adds the op's record into the local op queue (to track the op's +// progress), and sends a message to the coordinator indicating that this rank +// is ready to begin. The MPI background thread will handle the MPI message. +void EnqueueTensorCollective(CollectiveOpRecord record, + MPIRequest::RequestType rtype) { + const Tensor* input_tensor = record.in_t; + MPIRequest message; + message.set_request_rank(record.rank); + message.set_tensor_name(record.name); + message.set_tensor_type(record.dtype); + message.set_request_type(rtype); + input_tensor->shape().AsProto(message.mutable_tensor_shape()); + + mutex_lock guard(mpi_global.mu); + mpi_global.tensor_table.emplace(record.name, record); + mpi_global.message_queue.push(message); +} + +} // namespace + +#if GOOGLE_CUDA +cudaStream_t CudaStreamForMPI() { return mpi_global.stream; } +#endif + +// Op to initialize MPI in the current process. The settings used in the +// configuration are the same that must be used for all future MPI ops. +template +class MPIInitOp : public OpKernel { + public: + explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + bool on_gpu = IsGPUDevice(); + OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu)); + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU), + MPIInitOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU), + MPIInitOp); +#endif + +REGISTER_OP("MPIInit").Doc(R"doc( +Initialize MPI for the current process. + +If this is run on a GPU, then that GPU must be used for all future MPI +operations. If it is run on CPU, then all future MPI operations must also +run on CPU. +)doc"); + +// Op to get the current MPI Size. +template +class MPISizeOp : public OpKernel { + public: + explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK(context, IsMPIInitialized()); + + // Write integer to output tensor + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + + auto flat = output->flat(); + flat(0) = mpi_global.size; + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU), + MPISizeOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"), + MPISizeOp); +#endif + +REGISTER_OP("MPISize") + .Output("size: int32") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Returns the number of running MPI processes. + +More precisely, returns the number of MPI processes in the group associated +with the MPI_COMM_WORLD communicator. + +size: Size of the MPI group. +)doc"); + +// Op to get the current MPI Rank. +template +class MPIRankOp : public OpKernel { + public: + explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK(context, IsMPIInitialized()); + + // Write integer to output tensor + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + + auto flat = output->flat(); + flat(0) = mpi_global.rank; + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU), + MPIRankOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"), + MPIRankOp); +#endif + +REGISTER_OP("MPIRank") + .Output("rank: int32") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Returns the index of the current process in the MPI group. + +More precisely, returns the rank of the calling process in the MPI_COMM_WORLD +communicator. + +rank: Rank of the calling process. +)doc"); + +// Op to get the current local MPI Rank. +template +class MPILocalRankOp : public OpKernel { + public: + explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK(context, IsMPIInitialized()); + + // Write integer to output tensor + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + + auto flat = output->flat(); + flat(0) = mpi_global.local_rank; + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU), + MPILocalRankOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER( + Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"), + MPILocalRankOp); +#endif + +REGISTER_OP("MPILocalRank") + .Output("rank: int32") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Returns the index of the current process in the node it is on. + +More precisely, returns the rank of the calling process in communicator that +only spans the MPI processes running on that node. + +rank: Rank of the calling process on the node it is on. +)doc"); + +template +class MPIAllreduceOp : public AsyncOpKernel { + public: + explicit MPIAllreduceOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + // Although this op is handled asynchronously, the ComputeAsync call is + // very inexpensive. It only sets up a CollectiveOpRecord and places it + // in the table for the background thread to handle. Thus, we do not need + // a TF pool thread to perform the op. + bool IsExpensive() override { return false; } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); + const Tensor* input_tensor = &context->input(0); + Tensor* output_tensor; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output(0, input_tensor->shape(), &output_tensor), + done); + + // Record allocated on stack so op can fail without memory leak + CollectiveOpRecord record; + record.name = name(); + record.context = context; + record.in_t = input_tensor; + record.out_t = output_tensor; + record.on_gpu = IsGPUDevice(); + record.dtype = input_tensor->dtype(); + + const size_t temp_size = + (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size; + TensorShape temp_shape; + temp_shape.AddDim(temp_size); + OP_REQUIRES_OK_ASYNC(context, + context->allocate_temp(input_tensor->dtype(), + temp_shape, &record.temp_t), + done); + + auto allreduce_done_callback = [done, context](StatusOr status) { + context->SetStatus(status.status()); + done(); + }; + record.callback = allreduce_done_callback; + + auto allreduce_launch_callback = [record] { + EnqueueTensorCollective(record, MPIRequest::ALLREDUCE); + }; + + // If we are on a CPU, our device context will be null and we can't + // get a stream to enqueue this on. On a CPU this op is called when the + // data is already available, so we can just immediately do the + // allreduce; we don't have to wait for the data to get populated. +#if GOOGLE_CUDA + auto device_context = context->op_device_context(); + if (device_context == nullptr) { + allreduce_launch_callback(); + } else { + auto stream = device_context->stream(); + stream->ThenDoHostCallback(allreduce_launch_callback); + } +#else + allreduce_launch_callback(); +#endif + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU), + MPIAllreduceOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU), + MPIAllreduceOp); +#endif + +REGISTER_OP("MPIAllreduce") + .Attr("T: {int32, int64, float32}") + .Input("tensor: T") + .Output("sum: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); + }) + .Doc(R"doc( +Perform an MPI Allreduce on a tensor. All other processes that do a reduction +on a tensor with the same name must have the same dimension for that tensor. +Tensors are reduced with other tensors that have the same node name for the +allreduce. + +Arguments + tensor: A tensor to reduce. + +Output + sum: A tensor with the same shape as `tensor`, summed across all + MPI processes. +)doc"); + +template +class MPIAllgatherOp : public AsyncOpKernel { + public: + explicit MPIAllgatherOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + // Although this op is handled asynchronously, the ComputeAsync call is + // very inexpensive. It only sets up a CollectiveOpRecord and places it + // in the table for the background thread to handle. Thus, we do not need + // a TF pool thread to perform the op. + bool IsExpensive() override { return false; } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); + const Tensor* input_tensor = &context->input(0); + const Tensor* sizing_tensor = &context->input(1); + + // Record allocated on stack so op can fail without memory leak + CollectiveOpRecord record; + record.name = name(); + record.context = context; + record.in_t = input_tensor; + record.on_gpu = IsGPUDevice(); + + // Construct the output size from the sizing tensor + size_t output_first_dim = 0; + if (sizing_tensor->shape().dims() == 0) { + // 0-dim sizing_tensor implies that the op is just gathering + // a single element from each rank + output_first_dim = mpi_global.size; + for (int i = 0; i < mpi_global.size; i++) { + record.sizes_vec.push_back(1); + } + } else { + // Collect the total output tensor sizing from the sizing tensor + // NOTE: The sizing tensor is forced to be placed on the CPU by + // declaring the input as HostMemory, so it is valid to read it here. + const int64* sizing_array = + (const int64*)sizing_tensor->tensor_data().data(); + for (int i = 0; i < mpi_global.size; i++) { + record.sizes_vec.push_back(sizing_array[i]); + output_first_dim += sizing_array[i]; + } + } + + TensorShape output_shape; + output_shape.AddDim(output_first_dim); + for (int i = 1; i < input_tensor->shape().dims(); i++) { + output_shape.AddDim(input_tensor->shape().dim_size(i)); + } + + Tensor* output_tensor; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output_tensor), + done); + + record.out_t = output_tensor; + record.dtype = input_tensor->dtype(); + + auto allgather_done_callback = [done, context](StatusOr status) { + context->SetStatus(status.status()); + done(); + }; + record.callback = allgather_done_callback; + + auto allgather_launch_callback = [record] { + EnqueueTensorCollective(record, MPIRequest::ALLGATHER); + }; + + // If we are on a CPU, our device context will be null and we can't + // get a stream to enqueue this on. On a CPU this op is called when the + // data is already available, so we can just immediately do the + // allgather; we don't have to wait for the data to get populated. +#if GOOGLE_CUDA + auto device_context = context->op_device_context(); + if (device_context == nullptr) { + allgather_launch_callback(); + } else { + auto stream = device_context->stream(); + stream->ThenDoHostCallback(allgather_launch_callback); + } +#else + allgather_launch_callback(); +#endif + } +}; + +REGISTER_OP("MPIAllgather") + .Attr("T: {int32, int64, float32}") + .Attr("S: {int64}") + .Input("tensor: T") + .Input("sizes: S") + .Output("gathered: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle output; + TF_RETURN_IF_ERROR( + c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output)); + c->set_output(0, output); + return Status::OK(); + }) + .Doc(R"doc( +Perform an MPI Allgather on a tensor. All other processes that do a gather on a +tensor with the same name must have the same rank for that tensor, and have the +same dimension on all but the first dimension. + +Arguments + tensor: A tensor to gather. + sizes: A tensor containing the first-dimension sizes of tensors to be + gathered from other ranks + +Output + gathered: A tensor with the same shape as `tensor` except for the first + dimension, which is the sum of dimensions in `sizes`. +)doc"); + +REGISTER_KERNEL_BUILDER( + Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"), + MPIAllgatherOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER( + Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"), + MPIAllgatherOp); +#endif + +} // namespace mpi +} // namespace contrib +} // namespace tensorflow + +#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc index b9b482a6981e03144c6d00f2a38b71959b4b3621..b1cb89391ceaa70813be47cc1bba0c16f4f70e77 100644 --- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc +++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc @@ -24,7 +24,7 @@ limitations under the License. namespace tensorflow { -using ::perftools::gputools::cuda::ScopedActivateExecutorContext; +using se::cuda::ScopedActivateExecutorContext; // Contains data for a single stream used for nccl communication; this includes // a background thread that calls NcclManager::LoopKernelLaunches. @@ -37,11 +37,11 @@ struct NcclManager::NcclStream { cv.notify_all(); } - perftools::gputools::StreamExecutor* executor = nullptr; + se::StreamExecutor* executor = nullptr; // The stream on which to run the nccl collective. // This is a different stream than the tensorflow compute stream. - std::unique_ptr stream; + std::unique_ptr stream; // See NcclManager::LoopKernelLaunches for information on these. std::unique_ptr thread; @@ -95,9 +95,8 @@ ncclDataType_t ToNcclType(DataType t) { // A participant in a Collective. See below. struct NcclManager::Participant { Participant(const Tensor* in_t, Tensor* out_t, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - perftools::gputools::StreamExecutor* executor, int gpu_device_id, - NcclManager::DoneCallback done_callback) + se::Stream* tensor_stream, se::StreamExecutor* executor, + int gpu_device_id, NcclManager::DoneCallback done_callback) : in_t(in_t), out_t(out_t), event_mgr(event_mgr), @@ -121,11 +120,11 @@ struct NcclManager::Participant { EventMgr* const event_mgr; // Owned by the caller, who must keep it live until is called. - perftools::gputools::Stream* const tensor_stream; + se::Stream* const tensor_stream; // Matches the executor in CommunicatorMember::stream. Expected to be live for // process lifetime. - perftools::gputools::StreamExecutor* const executor = nullptr; + se::StreamExecutor* const executor = nullptr; const int gpu_device_id; @@ -245,7 +244,7 @@ NcclManager::Communicator* NcclManager::GetCommunicator( if (nccl_stream == nullptr) { nccl_stream = new NcclStream(); nccl_stream->executor = executor; - nccl_stream->stream.reset(new perftools::gputools::Stream(executor)); + nccl_stream->stream.reset(new se::Stream(executor)); nccl_stream->stream->Init(); streams.emplace_back(nccl_stream); @@ -300,10 +299,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator( void NcclManager::AddToAllReduce(int num_devices, const string& key, ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, Tensor* out_t, + se::Stream* tensor_stream, const Tensor* in_t, + Tensor* out_t, const DoneCallback& done_callback) { std::unique_ptr participant( new Participant(in_t, out_t, event_mgr, tensor_stream, executor, @@ -312,11 +311,12 @@ void NcclManager::AddToAllReduce(int num_devices, const string& key, kAllReduce, reduction_op); } -void NcclManager::AddBroadcastSend( - int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, int gpu_device_id, - EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, DoneCallback done_callback) { +void NcclManager::AddBroadcastSend(int num_devices, const string& key, + se::StreamExecutor* executor, + int gpu_device_id, EventMgr* event_mgr, + se::Stream* tensor_stream, + const Tensor* in_t, + DoneCallback done_callback) { std::unique_ptr participant( new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream, executor, gpu_device_id, std::move(done_callback))); @@ -325,11 +325,11 @@ void NcclManager::AddBroadcastSend( kBroadcast, ncclSum /* unused */); } -void NcclManager::AddBroadcastRecv( - int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, int gpu_device_id, - EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream, - Tensor* out_t, DoneCallback done_callback) { +void NcclManager::AddBroadcastRecv(int num_devices, const string& key, + se::StreamExecutor* executor, + int gpu_device_id, EventMgr* event_mgr, + se::Stream* tensor_stream, Tensor* out_t, + DoneCallback done_callback) { std::unique_ptr participant( new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream, executor, gpu_device_id, std::move(done_callback))); @@ -339,9 +339,8 @@ void NcclManager::AddBroadcastRecv( void NcclManager::AddReduceSend(int num_devices, const string& key, ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, const Tensor* in_t, DoneCallback done_callback) { std::unique_ptr participant( @@ -353,9 +352,8 @@ void NcclManager::AddReduceSend(int num_devices, const string& key, void NcclManager::AddReduceRecv(int num_devices, const string& key, ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, const Tensor* in_t, Tensor* out_t, DoneCallback done_callback) { std::unique_ptr participant( @@ -444,7 +442,7 @@ void NcclManager::RunCollective(const string& key, Collective* collective) { } void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) { - perftools::gputools::Stream* comm_stream = nccl_stream->stream.get(); + se::Stream* comm_stream = nccl_stream->stream.get(); ScopedActivateExecutorContext scoped_context(nccl_stream->executor); const cudaStream_t* cu_stream = reinterpret_cast( comm_stream->implementation()->CudaStreamMemberHack()); diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h index 6ff8cea84eb912d5e5c891c40efc617661725a63..57a96c5d3342f6e934e88367881388fb160dc5e3 100644 --- a/tensorflow/contrib/nccl/kernels/nccl_manager.h +++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h @@ -55,41 +55,34 @@ class NcclManager { // is also the stream that will use the produced data; is // not called until the next kernel launched on would see the data. void AddToAllReduce(int num_devices, const string& key, - ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + ncclRedOp_t reduction_op, se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, Tensor* out_t, - const DoneCallback& done_callback); + se::Stream* tensor_stream, const Tensor* in_t, + Tensor* out_t, const DoneCallback& done_callback); // AddBroadcastSend and AddBroadcastRecv combine to sent data from one sender // to all receivers. void AddBroadcastSend(int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, const Tensor* in_t, DoneCallback done_callback); void AddBroadcastRecv(int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, Tensor* out_t, DoneCallback done_callback); // AddReduceSend and AddReduceRecv combine to sent data from all senders // to one receiver. void AddReduceSend(int num_devices, const string& key, - ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + ncclRedOp_t reduction_op, se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, DoneCallback done_callback); + se::Stream* tensor_stream, const Tensor* in_t, + DoneCallback done_callback); void AddReduceRecv(int num_devices, const string& key, - ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + ncclRedOp_t reduction_op, se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, Tensor* out_t, - DoneCallback done_callback); + se::Stream* tensor_stream, const Tensor* in_t, + Tensor* out_t, DoneCallback done_callback); private: enum CollectiveType { @@ -123,8 +116,7 @@ class NcclManager { // Maps a device to the communication streams that make up its collective. // This is used to share the stream across different communicators that // include the same device. - std::map>> + std::map>> device_to_comm_streams_ GUARDED_BY(mu_); std::vector> communicators_; diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc index 06ca65e33ad6f5fb6620144231dd368379dcc190..4d8d922cb42d2974dab32cf4562bee3993bef098 100644 --- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc +++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc @@ -175,11 +175,9 @@ class NcclManagerTest : public ::testing::Test { nullptr /* step_resource_manager */); } - static perftools::gputools::DeviceMemory AsDeviceMemory( - const Scalar* cuda_memory) { - perftools::gputools::DeviceMemoryBase wrapped( - const_cast(cuda_memory)); - perftools::gputools::DeviceMemory typed(wrapped); + static se::DeviceMemory AsDeviceMemory(const Scalar* cuda_memory) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + se::DeviceMemory typed(wrapped); return typed; } diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 612ecc3e63891f4dabf97828fe75672dd7877a91..13aa1d7e7a11877373a848c1ba865aa418790cd0 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -25,6 +25,7 @@ py_library( "python/training/multitask_optimizer_wrapper.py", "python/training/nadam_optimizer.py", "python/training/powersign.py", + "python/training/reg_adagrad_optimizer.py", "python/training/sign_decay.py", "python/training/variable_clipping_optimizer.py", ], @@ -155,6 +156,25 @@ py_test( ], ) +py_test( + name = "reg_adagrad_optimizer_test", + srcs = ["python/training/reg_adagrad_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":opt_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:embedding_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//third_party/py/numpy", + ], +) + py_test( name = "nadam_optimizer_test", srcs = ["python/training/nadam_optimizer_test.py"], diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py index aeca900bc8ff4c4cc26da490ce43dfec70fd9f11..72117c1e81a164b0517fabeaddec3ea5132af5a9 100644 --- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py @@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer): epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) - # m := beta1 * m + (1 - beta1) * g_t + # \\(m := beta1 * m + (1 - beta1) * g_t\\) m = self.get_slot(var, "m") m_t = state_ops.scatter_update(m, grad.indices, beta1_t * array_ops.gather(m, grad.indices) + (1 - beta1_t) * grad.values, use_locking=self._use_locking) - # v := beta2 * v + (1 - beta2) * (g_t * g_t) + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) v = self.get_slot(var, "v") v_t = state_ops.scatter_update(v, grad.indices, beta2_t * array_ops.gather(v, grad.indices) + (1 - beta2_t) * math_ops.square(grad.values), use_locking=self._use_locking) - # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t)) + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) m_t_slice = array_ops.gather(m_t, grad.indices) v_t_slice = array_ops.gather(v_t, grad.indices) denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d0e0405a2c3e5ec05cf487a2ca48207b7a9d4663 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py @@ -0,0 +1,107 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""RegAdagrad for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import math_ops +from tensorflow.python.training import adagrad +from tensorflow.python.training import training_ops +from tensorflow.python.util import tf_contextlib + + +class RegAdagradOptimizer(adagrad.AdagradOptimizer): + """RegAdagrad: Adagrad with updates that optionally skip updating the slots. + + This is meant to address the problem of additional regularization terms in the + loss function affecting learning rate decay and causing hyper-param + entanglement. Example usage: + + loss = tf.nn.cross_entropy(x, labels) + reg_loss = reg_strength * tf.reduce_sum(x * x) + opt = tf.contrib.opt.RegAdagradOptimizer(learning_rate) + loss_update = opt.minimize(loss) + with opt.avoid_updating_slots(): + reg_update = opt.minimize(reg_loss) + total_update = tf.group([loss_update, reg_update]) + + # ... + + sess.run(total_update, ...) + """ + + def __init__(self, + learning_rate, + initial_accumulator_value=0.1, + use_locking=False, + name="RegAdagrad"): + super(RegAdagradOptimizer, self).__init__( + learning_rate, + initial_accumulator_value=initial_accumulator_value, + use_locking=use_locking, + name=name) + self._should_update_slots = True + + @tf_contextlib.contextmanager + def avoid_updating_slots(self): + old = self._should_update_slots + self._should_update_slots = False + try: + yield + finally: + self._should_update_slots = old + + def _apply_dense(self, grad, var): + acc = self.get_slot(var, "accumulator") + return training_ops.apply_adagrad( + var, + acc, + math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), + grad, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _resource_apply_dense(self, grad, var, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.resource_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), + grad, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _apply_sparse(self, grad, var, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.sparse_apply_adagrad( + var, + acc, + math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), + grad.values, + grad.indices, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _resource_apply_sparse(self, grad, var, indices, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.resource_sparse_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + grad, + indices, + use_locking=self._use_locking, + update_slots=self._should_update_slots) diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ea56e1646a0811ab065105cd260a760b5b718354 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py @@ -0,0 +1,343 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for Regreg_adagrad_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.opt.python.training import reg_adagrad_optimizer +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +class RegAdagradOptimizerTest(test.TestCase): + + def doTestBasic(self, use_locking=False, use_resource=False): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + if use_resource: + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + else: + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1, use_locking=use_locking) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testBasic(self): + self.doTestBasic(use_locking=False) + + def testBasicResource(self): + self.doTestBasic(use_locking=False, use_resource=True) + + def testBasicLocked(self): + self.doTestBasic(use_locking=True) + + def testMinimizeSparseResourceVariable(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = resource_variable_ops.ResourceVariable( + [[1.0, 2.0], [3.0, 4.0]], dtype=dtype) + x = constant_op.constant([[4.0], [5.0]], dtype=dtype) + pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x) + loss = pred * pred + sgd_op = reg_adagrad_optimizer.RegAdagradOptimizer(1.0).minimize(loss) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]], + var0.eval()) + # Run 1 step of sgd + sgd_op.run() + # Validate updated params + self.assertAllCloseAccordingToType( + [[0, 1], [3, 4]], var0.eval(), atol=0.01) + + def testTensorLearningRate(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + constant_op.constant(3.0), initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testSparseBasic(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([[1.0], [2.0]], dtype=dtype) + var1 = variables.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant([0.1], shape=[1, 1], dtype=dtype), + constant_op.constant([0]), constant_op.constant([2, 1])) + grads1 = ops.IndexedSlices( + constant_op.constant([0.01], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([[1.0], [2.0]], var0.eval()) + self.assertAllClose([[3.0], [4.0]], var1.eval()) + # Run 3 step of sgd + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([[-1.6026098728179932], [2.0]]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([[3.0], [3.715679168701172]]), var1.eval()) + + def testSparseRepeatedIndices(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + repeated_index_update_var = variables.Variable( + [[1.0], [2.0]], dtype=dtype) + aggregated_update_var = variables.Variable([[1.0], [2.0]], dtype=dtype) + grad_repeated_index = ops.IndexedSlices( + constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + constant_op.constant([1, 1]), constant_op.constant([2, 1])) + grad_aggregated = ops.IndexedSlices( + constant_op.constant([0.2], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + repeated_update = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0).apply_gradients([(grad_repeated_index, + repeated_index_update_var)]) + aggregated_update = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0).apply_gradients([(grad_aggregated, aggregated_update_var)]) + variables.global_variables_initializer().run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + + def testSparseRepeatedIndicesResourceVariable(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var_repeated = resource_variable_ops.ResourceVariable( + [1.0, 2.0], dtype=dtype) + loss_repeated = math_ops.reduce_sum( + embedding_ops.embedding_lookup(var_repeated, [0, 0])) + var_aggregated = resource_variable_ops.ResourceVariable( + [1.0, 2.0], dtype=dtype) + loss_aggregated = 2 * math_ops.reduce_sum( + embedding_ops.embedding_lookup(var_aggregated, [0])) + update_op_repeated = reg_adagrad_optimizer.RegAdagradOptimizer( + 2.0).minimize(loss_repeated) + update_op_aggregated = reg_adagrad_optimizer.RegAdagradOptimizer( + 2.0).minimize(loss_aggregated) + variables.global_variables_initializer().run() + self.assertAllCloseAccordingToType(var_repeated.eval(), + var_aggregated.eval()) + for _ in range(3): + update_op_repeated.run() + update_op_aggregated.run() + self.assertAllCloseAccordingToType(var_repeated.eval(), + var_aggregated.eval()) + + def testSparseStability(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + shape = [1, 6] + var0 = variables.Variable( + [[ + 0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257, + -0.0105945 + ]], + dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant( + [[ + -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, + -8.4877e-05, -9.48906e-05 + ]], + shape=shape, + dtype=dtype), constant_op.constant([0]), + constant_op.constant(shape)) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 1.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + init = variables.global_variables_initializer() + for _ in range(100): + init.run() + ada_update.run() + self.assertAllCloseAccordingToType( + np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([[ + 0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573, + -0.01029443 + ]]), var0.eval()) + + def testSharing(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(3.0) + # Apply the optimizer twice. Both applications will use + # the same accums. + ada_update1 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + ada_update2 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + ada_update1.run() + ada_update2.run() + ada_update1.run() + # Validate updated params (the same as with only 1 RegAdagrad). + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testDynamicShapeVariable_Ok(self): + with self.test_session(): + v = variable_scope.get_variable( + "v", initializer=constant_op.constant(1.), validate_shape=False) + self.assertFalse(v.shape.is_fully_defined()) + # Creating optimizer should cause no exception. + reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1) + + def testSkipUpdatingSlots(self): + iav = 0.130005 # A value that works with float16 + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=iav) + # Apply the optimizer twice. Both applications will use + # the same accums. + with ada_opt.avoid_updating_slots(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + for _ in range(3): + ada_update.run() + # Validate that ada_opt's slots are not updated. + self.assertAllCloseAccordingToType(np.array([iav, iav]), slot0.eval()) + self.assertAllCloseAccordingToType(np.array([iav, iav]), slot1.eval()) + + def testSparseSkipUpdatingSlots(self): + iav = 0.130005 # A value that works with float16 + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([[1.0], [2.0]], dtype=dtype) + var1 = variables.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant([0.1], shape=[1, 1], dtype=dtype), + constant_op.constant([0]), constant_op.constant([2, 1])) + grads1 = ops.IndexedSlices( + constant_op.constant([0.01], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=iav) + with ada_opt.avoid_updating_slots(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([[1.0], [2.0]], var0.eval()) + self.assertAllClose([[3.0], [4.0]], var1.eval()) + # Run 3 step of sgd + for _ in range(3): + ada_update.run() + # Validate that ada_opt's slots are not updated. + self.assertAllCloseAccordingToType( + np.array([[iav], [iav]]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([[iav], [iav]]), slot1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD index 86e5f4a43725b67cd7dba8152e788b64a5d57d26..5225ecc14fef3cec9506eceb776805b74a87714e 100644 --- a/tensorflow/contrib/optimizer_v2/BUILD +++ b/tensorflow/contrib/optimizer_v2/BUILD @@ -115,7 +115,6 @@ cuda_py_test( additional_deps = [ ":training", "@six_archive//:six", - "//tensorflow/contrib/eager/python:checkpointable_utils", "//tensorflow/python:constant_op", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", @@ -203,4 +202,5 @@ cuda_py_test( "//tensorflow/python:client_testlib", "//third_party/py/numpy", ], + tags = ["optonly"], ) diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index 42b7f92a76c1971e2a63722d769ee006c3f3210b..d538ad0fb02699ed8514f512208914f629a47436 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -40,23 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): Initialization: - ``` - m_0 <- 0 (Initialize initial 1st moment vector) - v_0 <- 0 (Initialize initial 2nd moment vector) - t <- 0 (Initialize timestep) - ``` + $$m_0 := 0 (Initialize initial 1st moment vector)$$ + $$v_0 := 0 (Initialize initial 2nd moment vector)$$ + $$t := 0 (Initialize timestep)$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: - ``` - t <- t + 1 - lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) + $$t := t + 1$$ + $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ - m_t <- beta1 * m_{t-1} + (1 - beta1) * g - v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g - variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) - ``` + $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ + $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ + $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ The default value of 1e-8 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py index 8ac9b581455f8f4c7af1a66432169ae179de1634..9e2858d00ff192e56680b288651975410c63c539 100644 --- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py +++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py @@ -702,8 +702,7 @@ class CheckpointCompatibilityTests(test.TestCase): with save_graph.as_default(), self.test_session( graph=save_graph) as session: root = self._initialized_model() - object_saver = checkpointable_utils.CheckpointableSaver(root) - save_path = object_saver.save( + save_path = root.save( session=session, file_prefix=checkpoint_prefix) with context.eager_mode(): root = self._initialized_model() @@ -716,8 +715,7 @@ class CheckpointCompatibilityTests(test.TestCase): checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") with context.eager_mode(): root = self._initialized_model() - object_saver = checkpointable_utils.CheckpointableSaver(root) - save_path = object_saver.save(file_prefix=checkpoint_prefix) + save_path = root.save(file_prefix=checkpoint_prefix) with context.graph_mode(): save_graph = ops.Graph() with save_graph.as_default(), self.test_session( diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index ce15db6f1ec067e5aeb6ddbc8939d2b773692269..46bfbb729fa9cdfc98f4228f516a7c5c42f23059 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -125,19 +125,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable): return update_op -class _StreamingModelPortProcessor(_OptimizableVariable): - """Processor for streaming ModelPorts.""" - - def __init__(self, v): - self._v = v - - def target(self): - return self._v - - def update_op(self, optimizer, g, *args): - return g - - class _TensorProcessor(_OptimizableVariable): """Processor for ordinary Tensors. @@ -167,8 +154,6 @@ def _get_processor(v): return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) - if v.op.type == "SubmodelPort": - return _StreamingModelPortProcessor(v) if isinstance(v, ops.Tensor): return _TensorProcessor(v) raise NotImplementedError("Trying to optimize unsupported type ", v) diff --git a/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..4e316819077c7dbb28beefd4dc260568f26da680 --- /dev/null +++ b/tensorflow/contrib/proto/python/kernel_tests/defaut_values.TestCase.pbtxt @@ -0,0 +1,94 @@ +primitive { + # No fields specified, so we get all defaults +} +shape: 1 +sizes: 0 +field { + name: "double_default" + dtype: DT_DOUBLE + expected { double_value: 1.0 } +} +sizes: 0 +field { + name: "float_default" + dtype: DT_DOUBLE # Try casting the float field to double. + expected { double_value: 2.0 } +} +sizes: 0 +field { + name: "int64_default" + dtype: DT_INT64 + expected { int64_value: 3 } +} +sizes: 0 +field { + name: "uint64_default" + dtype: DT_INT64 + expected { int64_value: 4 } +} +sizes: 0 +field { + name: "int32_default" + dtype: DT_INT32 + expected { int32_value: 5 } +} +sizes: 0 +field { + name: "fixed64_default" + dtype: DT_INT64 + expected { int64_value: 6 } +} +sizes: 0 +field { + name: "fixed32_default" + dtype: DT_INT32 + expected { int32_value: 7 } +} +sizes: 0 +field { + name: "bool_default" + dtype: DT_BOOL + expected { bool_value: true } +} +sizes: 0 +field { + name: "string_default" + dtype: DT_STRING + expected { string_value: "a" } +} +sizes: 0 +field { + name: "bytes_default" + dtype: DT_STRING + expected { string_value: "a longer default string" } +} +sizes: 0 +field { + name: "uint32_default" + dtype: DT_INT32 + expected { int32_value: -1 } +} +sizes: 0 +field { + name: "sfixed32_default" + dtype: DT_INT32 + expected { int32_value: 10 } +} +sizes: 0 +field { + name: "sfixed64_default" + dtype: DT_INT64 + expected { int64_value: 11 } +} +sizes: 0 +field { + name: "sint32_default" + dtype: DT_INT32 + expected { int32_value: 12 } +} +sizes: 0 +field { + name: "sint64_default" + dtype: DT_INT64 + expected { int64_value: 13 } +} diff --git a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt index db7555bf2dff58e894b1e84db63b6ec91e511a2a..bc07efc8f3038c6c540855c97b2254575e517ef3 100644 --- a/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt +++ b/tensorflow/contrib/proto/python/kernel_tests/promote_unsigned.TestCase.pbtxt @@ -4,7 +4,6 @@ primitive { } shape: 1 sizes: 1 -sizes: 1 field { name: "fixed32_value" dtype: DT_INT64 @@ -12,6 +11,7 @@ field { int64_value: 4294967295 } } +sizes: 1 field { name: "uint32_value" dtype: DT_INT64 @@ -19,3 +19,11 @@ field { int64_value: 4294967295 } } +sizes: 0 +field { + name: "uint32_default" + dtype: DT_INT64 + expected { + int64_value: 4294967295 # Comes from an explicitly-specified default + } +} diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto index dc495034ffae69dde4057858d37bc6afa600cd88..a2c88e372bf7c6b7f14c5bb55776b66c4c06bcd4 100644 --- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto +++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto @@ -72,6 +72,23 @@ message RepeatedPrimitiveValue { repeated sint32 sint32_value = 17; repeated sint64 sint64_value = 18; repeated PrimitiveValue message_value = 19; + + // Optional fields with explicitly-specified defaults. + optional double double_default = 20 [default = 1.0]; + optional float float_default = 21 [default = 2.0]; + optional int64 int64_default = 22 [default = 3]; + optional uint64 uint64_default = 23 [default = 4]; + optional int32 int32_default = 24 [default = 5]; + optional fixed64 fixed64_default = 25 [default = 6]; + optional fixed32 fixed32_default = 26 [default = 7]; + optional bool bool_default = 27 [default = true]; + optional string string_default = 28 [default = "a"]; + optional bytes bytes_default = 29 [default = "a longer default string"]; + optional uint32 uint32_default = 30 [default = 4294967295]; + optional sfixed32 sfixed32_default = 31 [default = 10]; + optional sfixed64 sfixed64_default = 32 [default = 11]; + optional sint32 sint32_default = 33 [default = 12]; + optional sint64 sint64_default = 34 [default = 13]; } // A PackedPrimitiveValue looks exactly the same as a RepeatedPrimitiveValue @@ -97,6 +114,22 @@ message PackedPrimitiveValue { repeated sint32 sint32_value = 17 [packed = true]; repeated sint64 sint64_value = 18 [packed = true]; repeated PrimitiveValue message_value = 19; + + optional double double_default = 20 [default = 1.0]; + optional float float_default = 21 [default = 2.0]; + optional int64 int64_default = 22 [default = 3]; + optional uint64 uint64_default = 23 [default = 4]; + optional int32 int32_default = 24 [default = 5]; + optional fixed64 fixed64_default = 25 [default = 6]; + optional fixed32 fixed32_default = 26 [default = 7]; + optional bool bool_default = 27 [default = true]; + optional string string_default = 28 [default = "a"]; + optional bytes bytes_default = 29 [default = "a longer default string"]; + optional uint32 uint32_default = 30 [default = 4294967295]; + optional sfixed32 sfixed32_default = 31 [default = 10]; + optional sfixed64 sfixed64_default = 32 [default = 11]; + optional sint32 sint32_default = 33 [default = 12]; + optional sint64 sint64_default = 34 [default = 13]; } message EnumValue { diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index aa0ef643088ef36b84596d08f78c29594ceca2d6..6f41722748b4754a293f4998e925577a71bc4576 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -501,8 +501,27 @@ def _GetBatchNormParams(graph, context, has_scaling): bn_decay_var_tensor = None split_context = context.split('/') - base_context = split_context[-1] - + # Matching variable names is brittle and relies on scoping + # conventions. Fused batch norm folding is more robust. Support for unfused + # batch norms will be deprecated as we move forward. Fused batch norms allow + # for faster training and should be used whenever possible. + # context contains part of the names of the tensors we are interested in: + # For MobilenetV1, the context has repetitions: + # MobilenetV1/MobilenetV1/Conv2d_3_depthwise + # when the moving_mean tensor has the name: + # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read + # To pick the correct variable name, it is necessary to ignore the repeating + # header. + + # For MobilenetV2, this problem does not exist: + # The context is: MobilenetV2/expanded_conv_3/depthwise + # and the names of the tensors start with a single MobilenetV2 + # The moving mean for example, has the name: + # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read + # We ignore the first string (MobilenetV1 or MobilenetV2) + # in the context to match correctly in both cases + + base_context = '/'.join(split_context[1:]) oplist = graph.get_operations() op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze' op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1' @@ -520,7 +539,6 @@ def _GetBatchNormParams(graph, context, has_scaling): op_suffix_gamma = base_context + '/BatchNorm/gamma' op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read' op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read' - # Parse through list of ops to find relevant ops for op in oplist: if op.name.endswith(op_suffix_mean): diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index af31467476b1536adef2bb74308fd1093f7bea7a..64e8142e7c60924e0e5f6136886943b518976c6f 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -134,6 +134,85 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): def testFoldConv2d(self): self._RunTestOverParameters(self._TestFoldConv2d) + def testMultipleLayerConv2d(self, + relu=nn_ops.relu, + relu_op_name='Relu', + has_scaling=True, + fused_batch_norm=False, + freeze_batch_norm_delay=None): + """Tests folding cases for a network with multiple layers. + + Args: + relu: Callable that returns an Operation, a factory method for the Relu*. + relu_op_name: String, name of the Relu* operation. + has_scaling: Bool, when true the batch norm has scaling. + fused_batch_norm: Bool, when true the batch norm is fused. + freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance + """ + g = ops.Graph() + with g.as_default(): + batch_size, height, width = 5, 128, 128 + inputs = array_ops.zeros((batch_size, height, width, 3)) + out_depth = 3 + stride = 1 + activation_fn = relu + scope = 'network/expanded_conv_1/conv' + layer1 = conv2d( + inputs, + out_depth, [5, 5], + stride=stride, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=activation_fn, + normalizer_fn=batch_norm, + normalizer_params=self._BatchNormParams( + scale=has_scaling, fused=fused_batch_norm), + scope=scope) + # Add another layer + scope = 'network/expanded_conv_2/conv' + + _ = conv2d( + layer1, + 2 * out_depth, [5, 5], + stride=stride, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=activation_fn, + normalizer_fn=batch_norm, + normalizer_params=self._BatchNormParams( + scale=has_scaling, fused=fused_batch_norm), + scope=scope) + + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) + folded_mul = g.get_operation_by_name(scope + '/mul_fold') + self.assertEqual(folded_mul.type, 'Mul') + self._AssertInputOpsAre(folded_mul, [ + scope + '/correction_mult', + self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) + ]) + self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) + + folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold') + self.assertEqual(folded_conv.type, 'Conv2D') + # Remove :0 at end of name for tensor prior to comparison + self._AssertInputOpsAre(folded_conv, + [scope + '/mul_fold', layer1.name[:-2]]) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) + + folded_add = g.get_operation_by_name(scope + '/add_fold') + self.assertEqual(folded_add.type, 'Add') + self._AssertInputOpsAre(folded_add, [ + scope + '/correction_add', + self._BathNormBiasName(scope, fused_batch_norm) + ]) + output_op_names = [scope + '/' + relu_op_name] + self._AssertOutputGoesToOps(folded_add, g, output_op_names) + + for op in g.get_operations(): + self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) + def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, freeze_batch_norm_delay): diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index d2d0426d233aaadb4ffd0fb222c77ade0a98278c..efc1a94b3c6e34cfd9f45e57af0be3faf4490866 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -133,19 +133,27 @@ def Quantize(graph, bits=activation_bits, producer_scope=scope, consumer_scope=scope) - _InsertQuantOp( - add_context, - 'add_quant', - layer_match.bypass_op, - input_to_ops_map.ConsumerOperations(layer_match.bypass_op), - is_training, - moving_avg=True, - ema_decay=ema_decay, - quant_delay=quant_delay, - vars_collection=vars_collection, - bits=activation_bits, - producer_scope=scope, - consumer_scope=scope) + # Make sure the op following this isn't an activation. In which case, we + # shouldn't quantize it, since the activation will be Fused into the + # Add at inference time. + consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op) + if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]): + logging.info('Skipping %s, because its followed by an activation.', + layer_match.bypass_op.name) + else: + _InsertQuantOp( + add_context, + 'add_quant', + layer_match.bypass_op, + input_to_ops_map.ConsumerOperations(layer_match.bypass_op), + is_training, + moving_avg=True, + ema_decay=ema_decay, + quant_delay=quant_delay, + vars_collection=vars_collection, + bits=activation_bits, + producer_scope=scope, + consumer_scope=scope) # Quantize bypass ops that occur after the activation. if layer_match.post_activation_bypass_op is not None: @@ -153,19 +161,27 @@ def Quantize(graph, r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1) # If `scope` is given, only quantize it if the producer is in the right # scope. - _InsertQuantOp( - post_activation_bypass_context, - 'post_activation_bypass_quant', - layer_match.post_activation_bypass_op, - input_to_ops_map.ConsumerOperations( - layer_match.post_activation_bypass_op), - is_training, - moving_avg=True, - ema_decay=ema_decay, - quant_delay=quant_delay, - vars_collection=vars_collection, - bits=activation_bits, - producer_scope=scope) + # Make sure the op following this isn't an activation. In which case, we + # shouldn't quantize it, since the activation will be Fused into the + # Add at inference time. + consumers = input_to_ops_map.ConsumerOperations( + layer_match.post_activation_bypass_op) + if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]): + logging.info('Skipping %s, because its followed by an activation.', + layer_match.post_activation_bypass_op.name) + else: + _InsertQuantOp( + post_activation_bypass_context, + 'post_activation_bypass_quant', + layer_match.post_activation_bypass_op, + consumers, + is_training, + moving_avg=True, + ema_decay=ema_decay, + quant_delay=quant_delay, + vars_collection=vars_collection, + bits=activation_bits, + producer_scope=scope) def _FindLayersToQuantize(graph): diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py index caf8ff28d50d2880d491d04c1ed368597519dcd7..54faf582f15a26c12813f3fdffe2dda6aa5cc91f 100644 --- a/tensorflow/contrib/quantize/python/quantize_graph_test.py +++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py @@ -113,20 +113,6 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase): # Ensure that variables were added. self.assertTrue(len(orig_variable_names) < len(q_variables)) - def testWithPreActivationBypass(self): - self._RunTestOverAllRewrites(self._TestWithPreActivationBypass) - - def _TestWithPreActivationBypass(self, rewrite_fn): - # Tests that the default graph is correctly used when no args are provided - # to rewrite_fn. - with ops.Graph().as_default() as g: - self._ConvLayer(pre_activation_bypass=True, scope='scope1') - rewrite_fn() - - op_names = [op.name for op in g.get_operations()] - self.assertTrue( - any('scope1/add_quant/' in name for name in op_names)) - def testWithPostActivationBypass(self): self._RunTestOverAllRewrites(self._TestWithPostActivationBypass) diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py index d37c83d6839f02c52a72cac97c9238c135dc2f66..5e479f394680420e510550a510d46d8fff2c03b0 100644 --- a/tensorflow/contrib/quantize/python/quantize_test.py +++ b/tensorflow/contrib/quantize/python/quantize_test.py @@ -82,9 +82,22 @@ class QuantizeTest(test_util.TensorFlowTestCase): quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) quantization_node_name = 'FakeQuantWithMinMaxVars' - add_quant = graph.get_operation_by_name('test/add_quant/' + - quantization_node_name) - self.assertEqual(add_quant.type, quantization_node_name) + conv_quant = graph.get_operation_by_name('test/test/conv_quant/' + + quantization_node_name) + self.assertEqual(conv_quant.type, quantization_node_name) + + # Scan through all FakeQuant operations, ensuring that the activation + # isn't in the consumers of the operation. Since activations are folded + # the preceding operation during inference, the FakeQuant operation after + # the activation is all that is needed. + for op in graph.get_operations(): + if op.type == quantization_node_name: + quant_op = graph.get_operation_by_name(op.name) + consumers = [] + for output in quant_op.outputs: + consumers.extend(output.consumers()) + + self.assertNotIn('test/identity', [c.name for c in consumers]) def testInsertQuantOpForAddAfterSeparableConv2d(self): self._RunTestOverParameters( @@ -109,9 +122,20 @@ class QuantizeTest(test_util.TensorFlowTestCase): quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) quantization_node_name = 'FakeQuantWithMinMaxVars' - add_quant = graph.get_operation_by_name('test/add_quant/' + - quantization_node_name) - self.assertEqual(add_quant.type, quantization_node_name) + conv_quant = graph.get_operation_by_name('test/test/conv_quant/' + + quantization_node_name) + self.assertEqual(conv_quant.type, quantization_node_name) + + for op in graph.get_operations(): + if op.type == quantization_node_name: + quant_op = graph.get_operation_by_name(op.name) + # Scan through all FakeQuant operations, ensuring that the activation + # identity op isn't in the consumers of the operation. + consumers = [] + for output in quant_op.outputs: + consumers.extend(output.consumers()) + + self.assertNotIn('test/identity', [c.name for c in consumers]) def testFinalLayerQuantized(self): self._RunTestOverParameters(self._TestFinalLayerQuantized) @@ -153,12 +177,21 @@ class QuantizeTest(test_util.TensorFlowTestCase): activation_fn=array_ops.identity, scope='test/test') bypass_tensor = math_ops.add(conv, input2, name='test/add') - _ = array_ops.identity(bypass_tensor, name='test/output') + # The output of the post_activation bypass will be another layer. + _ = conv2d( + bypass_tensor, + 32, [5, 5], + stride=2, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=array_ops.identity, + scope='test/unused') quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) - # Ensure that the bypass node is preceded and followed by - # FakeQuantWithMinMaxVars operations. + # Ensure that the bypass node is preceded by and followed by a + # FakeQuantWithMinMaxVar operation, since the output of the Add isn't an + # activation. self.assertTrue('FakeQuantWithMinMaxVars' in [c.type for c in bypass_tensor.consumers()]) self.assertTrue('FakeQuantWithMinMaxVars' in @@ -198,9 +231,9 @@ class QuantizeTest(test_util.TensorFlowTestCase): quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) - # Ensure that the bypass node is preceded and followed by - # FakeQuantWithMinMaxVars operations. - self.assertTrue('FakeQuantWithMinMaxVars' in + # Ensure that the bypass node is preceded by a FakeQuantWithMinMaxVar + # operation, and NOT followed by one. + self.assertTrue('FakeQuantWithMinMaxVars' not in [c.type for c in bypass_tensor.consumers()]) self.assertTrue('FakeQuantWithMinMaxVars' in [i.op.type for i in bypass_tensor.op.inputs]) diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc index 03006dab323a7c6dc83d9a17c035ef705f7b0366..45d22b739b8c597c7ebda85968aa44cd599a798c 100644 --- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc +++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc @@ -26,9 +26,9 @@ namespace tensorflow { #if GOOGLE_CUDA namespace { template -perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { - perftools::gputools::DeviceMemoryBase wrapped(const_cast(cuda_memory)); - perftools::gputools::DeviceMemory typed(wrapped); +se::DeviceMemory AsDeviceMemory(const T* cuda_memory) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + se::DeviceMemory typed(wrapped); return typed; } } // namespace @@ -41,9 +41,8 @@ void TensorCuBlasGemm::operator()(OpKernelContext* ctx, bool transa, T alpha, const T* a, int lda, const T* b, int ldb, T beta, T* c, int ldc) { #if GOOGLE_CUDA - perftools::gputools::blas::Transpose trans[] = { - perftools::gputools::blas::Transpose::kNoTranspose, - perftools::gputools::blas::Transpose::kTranspose}; + se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, + se::blas::Transpose::kTranspose}; auto a_ptr = AsDeviceMemory(a); auto b_ptr = AsDeviceMemory(b); diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py index e2e0dbc7a22efb2d8bf1e03e325eb8b6a4734993..3fc6bfbb4d03a39906d4441e48b2788423caa234 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py +++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py @@ -35,6 +35,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase): _protocol = 'grpc' invalid_method_string = 'Method not found' + connect_failed_string = 'Connect Failed' def __init__(self, methodName='runTest'): # pylint: disable=invalid-name super(RpcOpTest, self).__init__(methodName) diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py index 89f3ee1a1c52e4e292ef850ed375ad49b79fa1f5..27273d16b1c09eba60e124e632b353b09ea2d063 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py +++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py @@ -93,40 +93,39 @@ class RpcOpTestBase(object): response_values = sess.run(response_tensors) self.assertAllEqual(response_values.shape, [0]) - def testInvalidAddresses(self): - with self.test_session() as sess: - with self.assertRaisesOpError(self.invalid_method_string): - sess.run( - self.rpc( - method='/InvalidService.IncrementTestShapes', - address=self._address, - request='')) + def testInvalidMethod(self): + for method in [ + '/InvalidService.IncrementTestShapes', + self.get_method_name('InvalidMethodName') + ]: + with self.test_session() as sess: + with self.assertRaisesOpError(self.invalid_method_string): + sess.run(self.rpc(method=method, address=self._address, request='')) - with self.assertRaisesOpError(self.invalid_method_string): - sess.run( - self.rpc( - method=self.get_method_name('InvalidMethodName'), - address=self._address, - request='')) + _, status_code_value, status_message_value = sess.run( + self.try_rpc(method=method, address=self._address, request='')) + self.assertEqual(errors.UNIMPLEMENTED, status_code_value) + self.assertTrue( + self.invalid_method_string in status_message_value.decode('ascii')) - # This also covers the case of address='' - # and address='localhost:293874293874' + def testInvalidAddress(self): + # This covers the case of address='' and address='localhost:293874293874' + address = 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@' + with self.test_session() as sess: with self.assertRaises(errors.UnavailableError): sess.run( self.rpc( method=self.get_method_name('IncrementTestShapes'), - address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@', + address=address, request='')) - - # Test invalid method with the TryRpc op _, status_code_value, status_message_value = sess.run( self.try_rpc( - method=self.get_method_name('InvalidMethodName'), - address=self._address, + method=self.get_method_name('IncrementTestShapes'), + address=address, request='')) - self.assertEqual(errors.UNIMPLEMENTED, status_code_value) + self.assertEqual(errors.UNAVAILABLE, status_code_value) self.assertTrue( - self.invalid_method_string in status_message_value.decode('ascii')) + self.connect_failed_string in status_message_value.decode('ascii')) def testAlwaysFailingMethod(self): with self.test_session() as sess: @@ -138,6 +137,18 @@ class RpcOpTestBase(object): with self.assertRaisesOpError(I_WARNED_YOU): sess.run(response_tensors) + response_tensors, status_code, status_message = self.try_rpc( + method=self.get_method_name('AlwaysFailWithInvalidArgument'), + address=self._address, + request='') + self.assertEqual(response_tensors.shape, ()) + self.assertEqual(status_code.shape, ()) + self.assertEqual(status_message.shape, ()) + status_code_value, status_message_value = sess.run((status_code, + status_message)) + self.assertEqual(errors.INVALID_ARGUMENT, status_code_value) + self.assertTrue(I_WARNED_YOU in status_message_value.decode('ascii')) + def testSometimesFailingMethodWithManyRequests(self): with self.test_session() as sess: # Fail hard by default. @@ -197,8 +208,7 @@ class RpcOpTestBase(object): address=self._address, request=request_tensors) for _ in range(10) ] - # Launch parallel 10 calls to the RpcOp, each containing - # 20 rpc requests. + # Launch parallel 10 calls to the RpcOp, each containing 20 rpc requests. many_response_values = sess.run(many_response_tensors) self.assertEqual(10, len(many_response_values)) for response_values in many_response_values: diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py index 6781433a1f7ac712a62cfd19f1a2ecb632509fd4..cd162bae25aa1c1b6718b8e5b0b8687e5b80eab3 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py @@ -411,11 +411,11 @@ class AttentionWrapperTest(test.TestCase): def testLuongScaledDType(self): # Test case for GitHub issue 18099 - for dtype in [np.float16, np.float32, np.float64]: + for dt in [np.float16, np.float32, np.float64]: num_units = 128 - encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256]) + encoder_outputs = array_ops.placeholder(dt, shape=[64, None, 256]) encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64]) - decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128]) + decoder_inputs = array_ops.placeholder(dt, shape=[64, None, 128]) decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64]) batch_size = 64 attention_mechanism = wrapper.LuongAttention( @@ -423,7 +423,7 @@ class AttentionWrapperTest(test.TestCase): memory=encoder_outputs, memory_sequence_length=encoder_sequence_length, scale=True, - dtype=dtype, + dtype=dt, ) cell = rnn_cell.LSTMCell(num_units) cell = wrapper.AttentionWrapper(cell, attention_mechanism) @@ -434,12 +434,12 @@ class AttentionWrapperTest(test.TestCase): cell=cell, helper=helper, initial_state=cell.zero_state( - dtype=dtype, batch_size=batch_size)) + dtype=dt, batch_size=batch_size)) final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder) self.assertTrue( isinstance(final_outputs, basic_decoder.BasicDecoderOutput)) - self.assertEqual(final_outputs.rnn_output.dtype, dtype) + self.assertEqual(final_outputs.rnn_output.dtype, dt) self.assertTrue( isinstance(final_state, wrapper.AttentionWrapperState)) self.assertTrue( diff --git a/tensorflow/contrib/slim/python/slim/summaries.py b/tensorflow/contrib/slim/python/slim/summaries.py index 358359d6ebeea209fe83f7282f47db8be63747ac..a7dc3f6723a0d1a55dd3e8dce006e6acce083e6f 100644 --- a/tensorflow/contrib/slim/python/slim/summaries.py +++ b/tensorflow/contrib/slim/python/slim/summaries.py @@ -144,7 +144,7 @@ def add_zero_fraction_summary(tensor, name=None, prefix=None, A scalar `Tensor` of type `string` whose contents are the serialized `Summary` protocol buffer. """ - name = _get_summary_name(tensor, name, prefix, 'Fraction of Zero Values') + name = _get_summary_name(tensor, name, prefix, 'Fraction_of_Zero_Values') tensor = nn.zero_fraction(tensor) return add_scalar_summary(tensor, name, print_summary=print_summary) diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index 90033015ebc5e44ea70fbf2bc9735d0aeb4ec27d..e893e1d1c836cc7feef15757dde79d0db362cbaf 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -37,7 +37,7 @@ def _top_k_generator(k): def _top_k(probabilities, targets): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: - targets = array_ops.squeeze(targets, squeeze_dims=[1]) + targets = array_ops.squeeze(targets, axis=[1]) return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) return _top_k @@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None): def _squeeze_and_onehot(targets, depth): - targets = array_ops.squeeze(targets, squeeze_dims=[1]) + targets = array_ops.squeeze(targets, axis=[1]) return array_ops.one_hot(math_ops.to_int32(targets), depth) diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py index ff3ab21eaa9a4aa823f2ae7d3dd39674abea3d2a..745a5b1caf2fe348f1b276ccc245aa2ef350a62e 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py +++ b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py @@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer): # There is always one activation per instance by definition, so squeeze # away the extra dimension. - return array_ops.squeeze(nn_activations, squeeze_dims=[1]) + return array_ops.squeeze(nn_activations, axis=[1]) class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer): diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index b9bcbb170b04fe953be2d2dd515b607127d3cae6..7a35a70bbe3112e0649cefd8116cc50565978da5 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -445,7 +445,7 @@ class RandomForestGraphs(object): mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) gather_indices = array_ops.squeeze( - array_ops.where(mask), squeeze_dims=[1]) + array_ops.where(mask), axis=[1]) # TODO(thomaswc): Calculate out-of-bag data and labels, and store # them for use in calculating statistics later. tree_data = array_ops.gather(processed_dense_features, gather_indices) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b412b296e02751427b80e7c1596f2530942519c6..07740277115fe4956f21e8db1dabfb10990a2095 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -111,20 +111,22 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, } } -std::pair ParseTensorName(string name, int default_idx = 0) { +std::pair ParseTensorName(const string& name, + int default_idx = 0) { + string name_no_idx = name; int idx = default_idx; - size_t sep = name.find_last_of(':'); + const size_t sep = name_no_idx.find_last_of(':'); if (sep != string::npos) { - name = name.substr(0, sep); + name_no_idx = name_no_idx.substr(0, sep); idx = std::stoi(name.substr(sep + 1)); } - return std::make_pair(name, idx); + return std::make_pair(name_no_idx, idx); } std::unordered_map> BuildTensorNameMap( const std::vector& tensor_names) { std::unordered_map> result; - for (string const& tensor_name : tensor_names) { + for (const string& tensor_name : tensor_names) { string node_name; int index; std::tie(node_name, index) = ParseTensorName(tensor_name); @@ -132,6 +134,7 @@ std::unordered_map> BuildTensorNameMap( } return result; } + // TODO(sami): convert references to pointers struct ConvertGraphParams { ConvertGraphParams( diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index b32371b642f38b0851955a4a3beab97b86e1f6a0..53ba7badcaea15a049010049f427e1f4786df7ce 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -25,7 +25,6 @@ limitations under the License. namespace tensorflow { static ::tensorflow::tensorrt::Logger logger; -namespace gpu = ::perftools::gputools; using IRuntime = nvinfer1::IRuntime; using Dims = nvinfer1::Dims; diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD index 32e948a009741b126e21a64473ac2d020a25a7af..355303acf6ddf866ecf18815b394fcea8488d67d 100644 --- a/tensorflow/contrib/timeseries/examples/BUILD +++ b/tensorflow/contrib/timeseries/examples/BUILD @@ -8,14 +8,22 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) +config_setting( + name = "empty_condition", + values = {"define": "UNUSED=unused"}, +) + py_binary( name = "predict", srcs = ["predict.py"], srcs_version = "PY2AND3", tags = ["no_pip"], - deps = [ - "//tensorflow:tensorflow_py", + deps = select({ + ":empty_condition": [], + "//conditions:default": [], + }) + [ "//third_party/py/numpy", + "//tensorflow:tensorflow_py", ], ) @@ -41,9 +49,12 @@ py_binary( data = ["data/changepoints.csv"], srcs_version = "PY2AND3", tags = ["no_pip"], - deps = [ - "//tensorflow:tensorflow_py", + deps = select({ + ":empty_condition": [], + "//conditions:default": [], + }) + [ "//third_party/py/numpy", + "//tensorflow:tensorflow_py", ], ) @@ -64,9 +75,12 @@ py_binary( data = ["data/multivariate_level.csv"], srcs_version = "PY2AND3", tags = ["no_pip"], - deps = [ - "//tensorflow:tensorflow_py", + deps = select({ + ":empty_condition": [], + "//conditions:default": [], + }) + [ "//third_party/py/numpy", + "//tensorflow:tensorflow_py", ], ) @@ -89,11 +103,14 @@ py_binary( data = ["data/multivariate_periods.csv"], srcs_version = "PY2AND3", tags = ["no_pip"], - deps = [ + deps = select({ + ":empty_condition": [], + "//conditions:default": [], + }) + [ + "//third_party/py/numpy", "//tensorflow:tensorflow_py", "//tensorflow/contrib/timeseries/python/timeseries:estimators", "//tensorflow/contrib/timeseries/python/timeseries:model", - "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py index e77628ddd390374d6336e3583e07ce03cdec7aea..71621abc7190fae9973f78522e23f03d43e342c6 100644 --- a/tensorflow/contrib/timeseries/examples/known_anomaly.py +++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py @@ -41,17 +41,8 @@ _MODULE_PATH = path.dirname(__file__) _DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv") -def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300): - """Training, evaluating, and predicting on a series with changepoints.""" - - # Indicate the format of our exogenous feature, in this case a string - # representing a boolean value. - string_feature = tf.feature_column.categorical_column_with_vocabulary_list( - key="is_changepoint", vocabulary_list=["no", "yes"]) - # Specify the way this feature is presented to the model, here using a one-hot - # encoding. - one_hot_feature = tf.feature_column.indicator_column( - categorical_column=string_feature) +def state_space_esitmator(exogenous_feature_columns): + """Constructs a StructuralEnsembleRegressor.""" def _exogenous_update_condition(times, features): del times # unused @@ -62,14 +53,48 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300): # no changepoint. return tf.equal(tf.squeeze(features["is_changepoint"], axis=-1), "yes") - estimator = tf.contrib.timeseries.StructuralEnsembleRegressor( - periodicities=12, - # Extract a smooth period by constraining the number of latent values - # being cycled between. - cycle_num_latent_values=3, - num_features=1, - exogenous_feature_columns=[one_hot_feature], - exogenous_update_condition=_exogenous_update_condition) + return ( + tf.contrib.timeseries.StructuralEnsembleRegressor( + periodicities=12, + # Extract a smooth period by constraining the number of latent values + # being cycled between. + cycle_num_latent_values=3, + num_features=1, + exogenous_feature_columns=exogenous_feature_columns, + exogenous_update_condition=_exogenous_update_condition), + # Use truncated backpropagation with a window size of 64, batching + # together 4 of these windows (random offsets) per training step. Training + # with exogenous features often requires somewhat larger windows. + 4, 64) + + +def autoregressive_esitmator(exogenous_feature_columns): + input_window_size = 8 + output_window_size = 2 + return ( + tf.contrib.timeseries.ARRegressor( + periodicities=12, + num_features=1, + input_window_size=input_window_size, + output_window_size=output_window_size, + exogenous_feature_columns=exogenous_feature_columns), + 64, input_window_size + output_window_size) + + +def train_and_evaluate_exogenous( + estimator_fn, csv_file_name=_DATA_FILE, train_steps=300): + """Training, evaluating, and predicting on a series with changepoints.""" + # Indicate the format of our exogenous feature, in this case a string + # representing a boolean value. + string_feature = tf.feature_column.categorical_column_with_vocabulary_list( + key="is_changepoint", vocabulary_list=["no", "yes"]) + # Specify the way this feature is presented to the model, here using a one-hot + # encoding. + one_hot_feature = tf.feature_column.indicator_column( + categorical_column=string_feature) + + estimator, batch_size, window_size = estimator_fn( + exogenous_feature_columns=[one_hot_feature]) reader = tf.contrib.timeseries.CSVReader( csv_file_name, # Indicate the format of our CSV file. First we have two standard columns, @@ -85,10 +110,7 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300): # This CSV has a header line; here we just ignore it. skip_header_lines=1) train_input_fn = tf.contrib.timeseries.RandomWindowInputFn( - # Use truncated backpropagation with a window size of 64, batching - # together 4 of these windows (random offsets) per training step. Training - # with exogenous features often requires somewhat larger windows. - reader, batch_size=4, window_size=64) + reader, batch_size=batch_size, window_size=window_size) estimator.train(input_fn=train_input_fn, steps=train_steps) evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader) evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1) @@ -145,7 +167,12 @@ def main(unused_argv): if not HAS_MATPLOTLIB: raise ImportError( "Please install matplotlib to generate a plot from this example.") - make_plot("Ignoring a known anomaly", *train_and_evaluate_exogenous()) + make_plot("Ignoring a known anomaly (state space)", + *train_and_evaluate_exogenous( + estimator_fn=state_space_esitmator)) + make_plot("Ignoring a known anomaly (autoregressive)", + *train_and_evaluate_exogenous( + estimator_fn=autoregressive_esitmator, train_steps=3000)) pyplot.show() diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py index c3e307cad815d3c9c8556d0349d366d6f938101a..8c64f2e186a1aab0235f7cfbf1a942b872edd93b 100644 --- a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py +++ b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py @@ -23,12 +23,24 @@ from tensorflow.contrib.timeseries.examples import known_anomaly from tensorflow.python.platform import test -class KnownAnaomalyExampleTest(test.TestCase): +class KnownAnomalyExampleTest(test.TestCase): - def test_shapes_and_variance_structural(self): + def test_shapes_and_variance_structural_ar(self): (times, observed, all_times, mean, upper_limit, lower_limit, anomaly_locations) = known_anomaly.train_and_evaluate_exogenous( - train_steps=50) + train_steps=1, estimator_fn=known_anomaly.autoregressive_esitmator) + self.assertAllEqual( + anomaly_locations, + [25, 50, 75, 100, 125, 150, 175, 249]) + self.assertAllEqual(all_times.shape, mean.shape) + self.assertAllEqual(all_times.shape, upper_limit.shape) + self.assertAllEqual(all_times.shape, lower_limit.shape) + self.assertAllEqual(times.shape, observed.shape) + + def test_shapes_and_variance_structural_ssm(self): + (times, observed, all_times, mean, upper_limit, lower_limit, + anomaly_locations) = known_anomaly.train_and_evaluate_exogenous( + train_steps=50, estimator_fn=known_anomaly.state_space_esitmator) self.assertAllEqual( anomaly_locations, [25, 50, 75, 100, 125, 150, 175, 249]) diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py index 4f6527a5465ca01ed34150a26ba26d73a858cd74..558d9480b495ca87e828e8b440370ec9c6e3be2f 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py +++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py @@ -60,7 +60,8 @@ class ARModel(model.TimeSeriesModel): num_features, num_time_buckets=10, loss=NORMAL_LIKELIHOOD_LOSS, - hidden_layer_sizes=None): + hidden_layer_sizes=None, + exogenous_feature_columns=None): """Constructs an auto-regressive model. Args: @@ -81,6 +82,11 @@ class ARModel(model.TimeSeriesModel): observations and predictions, while the training loss is computed on normalized data (if input statistics are available). hidden_layer_sizes: list of sizes of hidden layers. + exogenous_feature_columns: A list of `tf.feature_column`s (for example + `tf.feature_column.embedding_column`) corresponding to exogenous + features which provide extra information to the model but are not part + of the series to be predicted. Passed to + `tf.feature_column.input_layer`. """ self.input_window_size = input_window_size self.output_window_size = output_window_size @@ -90,7 +96,12 @@ class ARModel(model.TimeSeriesModel): self.window_size = self.input_window_size + self.output_window_size self.loss = loss super(ARModel, self).__init__( - num_features=num_features) + num_features=num_features, + exogenous_feature_columns=exogenous_feature_columns) + if exogenous_feature_columns is not None: + self.exogenous_size = self._get_exogenous_embedding_shape()[-1] + else: + self.exogenous_size = 0 assert num_time_buckets > 0 self._buckets = int(num_time_buckets) if periodicities is None or not periodicities: @@ -110,7 +121,10 @@ class ARModel(model.TimeSeriesModel): # that the serving input_receiver_fn gets placeholder shapes correct. return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64), array_ops.zeros( - [self.input_window_size, self.num_features], dtype=self.dtype)) + [self.input_window_size, self.num_features], dtype=self.dtype), + array_ops.zeros( + [self.input_window_size, self.exogenous_size], + dtype=self.dtype)) # TODO(allenl,agarwal): Support sampling for AR. def random_model_parameters(self, seed=None): @@ -163,7 +177,7 @@ class ARModel(model.TimeSeriesModel): activations.append((activation, activation_size)) return activations - def prediction_ops(self, times, values): + def prediction_ops(self, times, values, exogenous_regressors): """Compute model predictions given input data. Args: @@ -173,6 +187,8 @@ class ARModel(model.TimeSeriesModel): prediction times. values: A [batch size, self.input_window_size, self.num_features] Tensor with input features. + exogenous_regressors: A [batch size, self.window_size, + self.exogenous_size] Tensor with exogenous features. Returns: Tuple (predicted_mean, predicted_covariance), where each element is a Tensor with shape [batch size, self.output_window_size, @@ -183,25 +199,33 @@ class ARModel(model.TimeSeriesModel): if self.input_window_size: values.get_shape().assert_is_compatible_with( [None, self.input_window_size, self.num_features]) + if exogenous_regressors is not None: + exogenous_regressors.get_shape().assert_is_compatible_with( + [None, self.window_size, self.exogenous_size]) # Create input features. + activation_components = [] if self._periods: _, time_features = self._compute_time_features(times) activation_size = self.window_size * self._buckets * len(self._periods) - activation = array_ops.reshape(time_features, [-1, activation_size]) + activation_components.append( + array_ops.reshape(time_features, [-1, activation_size])) else: activation_size = 0 - activation = None - if self.input_window_size: inp = array_ops.slice(values, [0, 0, 0], [-1, self.input_window_size, -1]) inp_size = self.input_window_size * self.num_features inp = array_ops.reshape(inp, [-1, inp_size]) - if activation is not None: - activation = array_ops.concat([inp, activation], 1) - else: - activation = inp + activation_components.append(inp) activation_size += inp_size + if self.exogenous_size: + exogenous_size = self.window_size * self.exogenous_size + activation_size += exogenous_size + exogenous_flattened = array_ops.reshape( + exogenous_regressors, [-1, exogenous_size]) + activation_components.append(exogenous_flattened) assert activation_size + assert activation_components + activation = array_ops.concat(activation_components, axis=1) activations.append((activation, activation_size)) # Create hidden layers. activations += self._create_hidden_stack(activation, activation_size) @@ -228,6 +252,19 @@ class ARModel(model.TimeSeriesModel): math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype) return loss_op + def _process_exogenous_features(self, times, features): + embedded = super(ARModel, self)._process_exogenous_features( + times=times, features=features) + if embedded is None: + assert self.exogenous_size == 0 + # No embeddings. Return a zero-size [batch, times, 0] array so we don't + # have to special case it downstream. + return array_ops.zeros( + array_ops.concat([array_ops.shape(times), constant_op.constant([0])], + axis=0)) + else: + return embedded + # TODO(allenl, agarwal): Consider better ways of warm-starting predictions. def predict(self, features): """Computes predictions multiple steps into the future. @@ -243,6 +280,7 @@ class ARModel(model.TimeSeriesModel): segment of the time series before `TIMES`. This data is used to start of the autoregressive computation. This should have data for at least self.input_window_size timesteps. + And any exogenous features, with shapes prefixed by shape of `TIMES`. Returns: A dictionary with keys, "mean", "covariance". The values are Tensors of shape [batch_size, predict window size, @@ -250,25 +288,39 @@ class ARModel(model.TimeSeriesModel): """ predict_times = math_ops.cast( ops.convert_to_tensor(features[PredictionFeatures.TIMES]), dtypes.int32) + exogenous_regressors = self._process_exogenous_features( + times=predict_times, + features={key: value for key, value in features.items() + if key not in [TrainEvalFeatures.TIMES, + TrainEvalFeatures.VALUES, + PredictionFeatures.STATE_TUPLE]}) + with ops.control_dependencies( + [check_ops.assert_equal(array_ops.shape(predict_times)[1], + array_ops.shape(exogenous_regressors)[1])]): + exogenous_regressors = array_ops.identity(exogenous_regressors) batch_size = array_ops.shape(predict_times)[0] num_predict_values = array_ops.shape(predict_times)[1] prediction_iterations = ((num_predict_values + self.output_window_size - 1) // self.output_window_size) - # Pad predict_times so as to have exact multiple of self.output_window_size - # values per example. + # Pad predict_times and exogenous regressors so as to have exact multiple of + # self.output_window_size values per example. padding_size = (prediction_iterations * self.output_window_size - num_predict_values) - padding = array_ops.zeros([batch_size, padding_size], predict_times.dtype) - predict_times = control_flow_ops.cond( - padding_size > 0, lambda: array_ops.concat([predict_times, padding], 1), - lambda: predict_times) + predict_times = array_ops.pad( + predict_times, [[0, 0], [0, padding_size]]) + exogenous_regressors = array_ops.pad( + exogenous_regressors, [[0, 0], [0, padding_size], [0, 0]]) state = features[PredictionFeatures.STATE_TUPLE] - (state_times, state_values) = state + (state_times, state_values, state_exogenous_regressors) = state state_times = math_ops.cast( ops.convert_to_tensor(state_times), dtypes.int32) state_values = ops.convert_to_tensor(state_values, dtype=self.dtype) + state_exogenous_regressors = ops.convert_to_tensor( + state_exogenous_regressors, dtype=self.dtype) initial_input_times = predict_times[:, :self.output_window_size] + initial_input_exogenous_regressors = ( + exogenous_regressors[:, :self.output_window_size, :]) if self.input_window_size > 0: initial_input_times = array_ops.concat( [state_times[:, -self.input_window_size:], initial_input_times], 1) @@ -279,6 +331,11 @@ class ARModel(model.TimeSeriesModel): check_ops.assert_equal(values_size, times_size) ]): initial_input_values = state_values[:, -self.input_window_size:, :] + initial_input_exogenous_regressors = array_ops.concat( + [state_exogenous_regressors[:, -self.input_window_size:, :], + initial_input_exogenous_regressors[ + :, :self.output_window_size, :]], + axis=1) else: initial_input_values = 0 @@ -288,9 +345,10 @@ class ARModel(model.TimeSeriesModel): return math_ops.less(iteration_number, prediction_iterations) def _while_body(iteration_number, input_times, input_values, - mean_ta, covariance_ta): + input_exogenous_regressors, mean_ta, covariance_ta): """Predict self.output_window_size values.""" - prediction_ops = self.prediction_ops(input_times, input_values) + prediction_ops = self.prediction_ops( + input_times, input_values, input_exogenous_regressors) predicted_mean = prediction_ops["mean"] predicted_covariance = prediction_ops["covariance"] offset = self.output_window_size * gen_math_ops.minimum( @@ -299,20 +357,33 @@ class ARModel(model.TimeSeriesModel): if self.output_window_size < self.input_window_size: new_input_values = array_ops.concat( [input_values[:, self.output_window_size:, :], predicted_mean], 1) + new_input_exogenous_regressors = array_ops.concat( + [input_exogenous_regressors[:, -self.input_window_size:, :], + exogenous_regressors[ + :, offset:offset + self.output_window_size, :]], + axis=1) new_input_times = array_ops.concat([ - input_times[:, self.output_window_size:], + input_times[:, -self.input_window_size:], predict_times[:, offset:offset + self.output_window_size] ], 1) else: new_input_values = predicted_mean[:, -self.input_window_size:, :] + new_input_exogenous_regressors = exogenous_regressors[ + :, + offset - self.input_window_size:offset + self.output_window_size, + :] new_input_times = predict_times[ :, offset - self.input_window_size:offset + self.output_window_size] else: new_input_values = input_values + new_input_exogenous_regressors = exogenous_regressors[ + :, offset:offset + self.output_window_size, :] new_input_times = predict_times[:, offset:offset + self.output_window_size] new_input_times.set_shape(initial_input_times.get_shape()) + new_input_exogenous_regressors.set_shape( + initial_input_exogenous_regressors.get_shape()) new_mean_ta = mean_ta.write(iteration_number, predicted_mean) if isinstance(covariance_ta, tensor_array_ops.TensorArray): new_covariance_ta = covariance_ta.write(iteration_number, @@ -322,6 +393,7 @@ class ARModel(model.TimeSeriesModel): return (iteration_number + 1, new_input_times, new_input_values, + new_input_exogenous_regressors, new_mean_ta, new_covariance_ta) @@ -332,9 +404,13 @@ class ARModel(model.TimeSeriesModel): if self.loss != ARModel.SQUARED_LOSS else 0.) mean_ta_init = tensor_array_ops.TensorArray( dtype=self.dtype, size=prediction_iterations) - _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop( + _, _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop( _while_condition, _while_body, [ - 0, initial_input_times, initial_input_values, mean_ta_init, + 0, + initial_input_times, + initial_input_values, + initial_input_exogenous_regressors, + mean_ta_init, covariance_ta_init ]) @@ -366,11 +442,11 @@ class ARModel(model.TimeSeriesModel): return {"mean": predicted_mean, "covariance": predicted_covariance} - def _process_window(self, features, mode): + def _process_window(self, features, mode, exogenous_regressors): """Compute model outputs on a single window of data.""" - # TODO(agarwal): Use exogenous features times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtypes.int64) values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype) + exogenous_regressors = math_ops.cast(exogenous_regressors, dtype=self.dtype) original_values = values # Extra shape checking for the window size (above that in @@ -395,7 +471,8 @@ class ARModel(model.TimeSeriesModel): input_values = values[:, :self.input_window_size, :] else: input_values = None - prediction_ops = self.prediction_ops(times, input_values) + prediction_ops = self.prediction_ops( + times, input_values, exogenous_regressors) prediction = prediction_ops["mean"] covariance = prediction_ops["covariance"] targets = array_ops.slice(values, [0, self.input_window_size, 0], @@ -419,7 +496,8 @@ class ARModel(model.TimeSeriesModel): return model.ModelOutputs( loss=loss, end_state=(times[:, -self.input_window_size:], - values[:, -self.input_window_size:, :]), + values[:, -self.input_window_size:, :], + exogenous_regressors[:, -self.input_window_size:, :]), predictions={"mean": prediction, "covariance": covariance, "observed": original_values[:, -self.output_window_size:]}, prediction_times=times[:, -self.output_window_size:]) @@ -454,17 +532,24 @@ class ARModel(model.TimeSeriesModel): """ features = {feature_name: ops.convert_to_tensor(feature_value) for feature_name, feature_value in features.items()} + times = features[TrainEvalFeatures.TIMES] + exogenous_regressors = self._process_exogenous_features( + times=times, + features={key: value for key, value in features.items() + if key not in [TrainEvalFeatures.TIMES, + TrainEvalFeatures.VALUES, + PredictionFeatures.STATE_TUPLE]}) if mode == estimator_lib.ModeKeys.TRAIN: # For training, we require the window size to be self.window_size as # iterating sequentially on larger windows could introduce a bias. - return self._process_window(features, mode=mode) + return self._process_window( + features, mode=mode, exogenous_regressors=exogenous_regressors) elif mode == estimator_lib.ModeKeys.EVAL: # For evaluation, we allow the user to pass in a larger window, in which # case we try to cover as much of the window as possible without # overlap. Quantitative evaluation is more efficient/correct with fixed # windows matching self.window_size (as with training), but this looping # allows easy plotting of "in-sample" predictions. - times = features[TrainEvalFeatures.TIMES] times.get_shape().assert_has_rank(2) static_window_size = times.get_shape()[1].value if (static_window_size is not None @@ -500,7 +585,9 @@ class ARModel(model.TimeSeriesModel): feature_name: feature_value[:, base_offset:base_offset + self.window_size] for feature_name, feature_value in features.items()}, - mode=mode) + mode=mode, + exogenous_regressors=exogenous_regressors[ + :, base_offset:base_offset + self.window_size]) # This code needs to be updated if new predictions are added in # self._process_window assert len(model_outputs.predictions) == 3 @@ -525,7 +612,9 @@ class ARModel(model.TimeSeriesModel): batch_size = array_ops.shape(times)[0] prediction_shape = [batch_size, self.output_window_size * num_iterations, self.num_features] - previous_state_times, previous_state_values = state + (previous_state_times, + previous_state_values, + previous_state_exogenous_regressors) = state # Make sure returned state always has windows of self.input_window_size, # even if we were passed fewer than self.input_window_size points this # time. @@ -540,14 +629,24 @@ class ARModel(model.TimeSeriesModel): self._scale_data(values)], axis=1)[:, -self.input_window_size:, :] new_state_values.set_shape((None, self.input_window_size, self.num_features)) + new_exogenous_regressors = array_ops.concat( + [previous_state_exogenous_regressors, + exogenous_regressors], axis=1)[:, -self.input_window_size:, :] + new_exogenous_regressors.set_shape( + (None, + self.input_window_size, + self.exogenous_size)) else: # There is no state to keep, and the strided slices above do not handle # input_window_size=0. new_state_times = previous_state_times new_state_values = previous_state_values + new_exogenous_regressors = previous_state_exogenous_regressors return model.ModelOutputs( loss=math_ops.reduce_mean(loss_ta.stack(), axis=0), - end_state=(new_state_times, new_state_values), + end_state=(new_state_times, + new_state_values, + new_exogenous_regressors), predictions={ "mean": array_ops.reshape( array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]), @@ -604,7 +703,8 @@ class AnomalyMixtureARModel(ARModel): num_features, anomaly_distribution=GAUSSIAN_ANOMALY, num_time_buckets=10, - hidden_layer_sizes=None): + hidden_layer_sizes=None, + exogenous_feature_columns=None): assert (anomaly_prior_probability < 1.0 and anomaly_prior_probability > 0.0) self._anomaly_prior_probability = anomaly_prior_probability @@ -619,7 +719,8 @@ class AnomalyMixtureARModel(ARModel): input_window_size=input_window_size, output_window_size=output_window_size, loss=ARModel.NORMAL_LIKELIHOOD_LOSS, - hidden_layer_sizes=hidden_layer_sizes) + hidden_layer_sizes=hidden_layer_sizes, + exogenous_feature_columns=exogenous_feature_columns) def _create_anomaly_ops(self, times, values, prediction_ops_dict): anomaly_log_param = variable_scope.get_variable( @@ -631,9 +732,9 @@ class AnomalyMixtureARModel(ARModel): # distribution. prediction_ops_dict["anomaly_params"] = gen_math_ops.exp(anomaly_log_param) - def prediction_ops(self, times, values): + def prediction_ops(self, times, values, exogenous_regressors): prediction_ops_dict = super(AnomalyMixtureARModel, self).prediction_ops( - times, values) + times, values, exogenous_regressors) self._create_anomaly_ops(times, values, prediction_ops_dict) return prediction_ops_dict diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py index 1e1ca4e77fc41bb418cf2521c2c7fbed9f27c6a8..d078ac8d46397d00efefc32e19aa9bd1133aebaa 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py @@ -155,12 +155,15 @@ class ARModelTest(test.TestCase): state_times = np.expand_dims(train_data_times[:input_window_size], 0) state_values = np.expand_dims( train_data_values[:input_window_size, :], 0) + state_exogenous = state_times[:, :, None][:, :, :0] def prediction_input_fn(): return ({ PredictionFeatures.TIMES: training.limit_epochs( predict_times, num_epochs=1), - PredictionFeatures.STATE_TUPLE: (state_times, state_values) + PredictionFeatures.STATE_TUPLE: (state_times, + state_values, + state_exogenous) }, {}) (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn)) predicted_mean = predictions["mean"][:, 0] @@ -246,7 +249,8 @@ class ARModelTest(test.TestCase): with session.Session(): predicted_values = model.predict({ PredictionFeatures.TIMES: [[4, 6, 10]], - PredictionFeatures.STATE_TUPLE: ([[1, 2]], [[[1.], [2.]]]) + PredictionFeatures.STATE_TUPLE: ( + [[1, 2]], [[[1.], [2.]]], [[[], []]]) }) variables.global_variables_initializer().run() self.assertAllEqual(predicted_values["mean"].eval().shape, diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py index 886e1846e2a4f75503a47a3ff92adf97f814053f..f4608ca2d1cc286575f10f6922335f43c7ec0231 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py @@ -190,7 +190,7 @@ class ARRegressor(TimeSeriesRegressor): def __init__( self, periodicities, input_window_size, output_window_size, - num_features, num_time_buckets=10, + num_features, exogenous_feature_columns=None, num_time_buckets=10, loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, hidden_layer_sizes=None, anomaly_prior_probability=None, anomaly_distribution=None, optimizer=None, model_dir=None, config=None): @@ -205,7 +205,12 @@ class ARRegressor(TimeSeriesRegressor): output_window_size: Number of future time steps to predict. Note that setting it to > 1 empirically seems to give a better fit. num_features: The dimensionality of the time series (one for univariate, - more than one for multivariate). + more than one for multivariate). + exogenous_feature_columns: A list of `tf.feature_column`s (for example + `tf.feature_column.embedding_column`) corresponding to exogenous + features which provide extra information to the model but are not part + of the series to be predicted. Passed to + `tf.feature_column.input_layer`. num_time_buckets: Number of buckets into which to divide (time % periodicity) for generating time based features. loss: Loss function to use for training. Currently supported values are @@ -241,6 +246,7 @@ class ARRegressor(TimeSeriesRegressor): anomaly_distribution = ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY model = ar_model.ARModel( periodicities=periodicities, num_features=num_features, + exogenous_feature_columns=exogenous_feature_columns, num_time_buckets=num_time_buckets, input_window_size=input_window_size, output_window_size=output_window_size, loss=loss, @@ -255,6 +261,7 @@ class ARRegressor(TimeSeriesRegressor): input_window_size=input_window_size, output_window_size=output_window_size, num_features=num_features, + exogenous_feature_columns=exogenous_feature_columns, num_time_buckets=num_time_buckets, hidden_layer_sizes=hidden_layer_sizes, anomaly_prior_probability=anomaly_prior_probability, diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py index 9f161c1695f415ad28c41ad0c00bc0b056399b96..eebee053f8e6000bdf17996abde03896bf4f32e1 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py @@ -29,6 +29,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils from tensorflow.python.client import session from tensorflow.python.estimator import estimator_lib +from tensorflow.python.feature_column import feature_column from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.platform import test @@ -48,12 +49,17 @@ class TimeSeriesRegressorTest(test.TestCase): def _fit_restore_fit_test_template(self, estimator_fn, dtype): """Tests restoring previously fit models.""" model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - first_estimator = estimator_fn(model_dir) + exogenous_feature_columns = ( + feature_column.numeric_column("exogenous"), + ) + first_estimator = estimator_fn(model_dir, exogenous_feature_columns) times = numpy.arange(20, dtype=numpy.int64) values = numpy.arange(20, dtype=dtype.as_numpy_dtype) + exogenous = numpy.arange(20, dtype=dtype.as_numpy_dtype) features = { feature_keys.TrainEvalFeatures.TIMES: times, - feature_keys.TrainEvalFeatures.VALUES: values + feature_keys.TrainEvalFeatures.VALUES: values, + "exogenous": exogenous } train_input_fn = input_pipeline.RandomWindowInputFn( input_pipeline.NumpyReader(features), shuffle_seed=2, num_threads=1, @@ -68,14 +74,19 @@ class TimeSeriesRegressorTest(test.TestCase): first_loss_after_fit = first_estimator.evaluate( input_fn=eval_input_fn, steps=1)["loss"] self.assertLess(first_loss_after_fit, first_loss_before_fit) - second_estimator = estimator_fn(model_dir) + second_estimator = estimator_fn(model_dir, exogenous_feature_columns) second_estimator.train(input_fn=train_input_fn, steps=2) whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn( input_pipeline.NumpyReader(features)) whole_dataset_evaluation = second_estimator.evaluate( input_fn=whole_dataset_input_fn, steps=1) + exogenous_values_ten_steps = { + "exogenous": numpy.arange( + 10, dtype=dtype.as_numpy_dtype)[None, :, None] + } predict_input_fn = input_pipeline.predict_continuation_input_fn( evaluation=whole_dataset_evaluation, + exogenous_features=exogenous_values_ten_steps, steps=10) # Also tests that limit_epochs in predict_continuation_input_fn prevents # infinite iteration @@ -92,6 +103,7 @@ class TimeSeriesRegressorTest(test.TestCase): saved_prediction = saved_model_utils.predict_continuation( continue_from=whole_dataset_evaluation, steps=10, + exogenous_features=exogenous_values_ten_steps, signatures=signatures, session=sess) # Saved model predictions should be the same as Estimator predictions @@ -104,7 +116,8 @@ class TimeSeriesRegressorTest(test.TestCase): continue_from=whole_dataset_evaluation, features={ feature_keys.FilteringFeatures.TIMES: times[None, -1] + 2, - feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2. + feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2., + "exogenous": values[None, -1, None] + 12. }, signatures=signatures, session=sess) @@ -112,6 +125,10 @@ class TimeSeriesRegressorTest(test.TestCase): second_saved_prediction = saved_model_utils.predict_continuation( continue_from=first_filtering, steps=1, + exogenous_features={ + "exogenous": numpy.arange( + 1, dtype=dtype.as_numpy_dtype)[None, :, None] + }, signatures=signatures, session=sess) self.assertEqual( @@ -122,7 +139,8 @@ class TimeSeriesRegressorTest(test.TestCase): continue_from=first_filtering, features={ feature_keys.FilteringFeatures.TIMES: times[-1] + 3, - feature_keys.FilteringFeatures.VALUES: values[-1] + 3. + feature_keys.FilteringFeatures.VALUES: values[-1] + 3., + "exogenous": values[-1, None] + 13. }, signatures=signatures, session=sess) @@ -131,7 +149,8 @@ class TimeSeriesRegressorTest(test.TestCase): six.assertCountEqual( self, [feature_keys.FilteringFeatures.TIMES, - feature_keys.FilteringFeatures.VALUES], + feature_keys.FilteringFeatures.VALUES, + "exogenous"], signatures.signature_def[ feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys()) batch_numpy_times = numpy.tile( @@ -142,7 +161,8 @@ class TimeSeriesRegressorTest(test.TestCase): session=sess, features={ feature_keys.FilteringFeatures.TIMES: batch_numpy_times, - feature_keys.FilteringFeatures.VALUES: batch_numpy_values + feature_keys.FilteringFeatures.VALUES: batch_numpy_values, + "exogenous": 10. + batch_numpy_values } ) predict_times = numpy.tile( @@ -150,26 +170,32 @@ class TimeSeriesRegressorTest(test.TestCase): predictions = saved_model_utils.predict_continuation( continue_from=state, times=predict_times, + exogenous_features={ + "exogenous": numpy.tile(numpy.arange( + 15, dtype=dtype.as_numpy_dtype), (10,))[None, :, None] + }, signatures=signatures, session=sess) self.assertAllEqual([10, 15, 1], predictions["mean"].shape) def test_fit_restore_fit_ar_regressor(self): - def _estimator_fn(model_dir): + def _estimator_fn(model_dir, exogenous_feature_columns): return estimators.ARRegressor( periodicities=10, input_window_size=10, output_window_size=6, num_features=1, model_dir=model_dir, config=_SeedRunConfig(), # This test is flaky with normal likelihood loss (could add more # training iterations instead). - loss=ar_model.ARModel.SQUARED_LOSS) + loss=ar_model.ARModel.SQUARED_LOSS, + exogenous_feature_columns=exogenous_feature_columns) self._fit_restore_fit_test_template(_estimator_fn, dtype=dtypes.float32) def test_fit_restore_fit_structural_ensemble_regressor(self): dtype = dtypes.float32 - def _estimator_fn(model_dir): + def _estimator_fn(model_dir, exogenous_feature_columns): return estimators.StructuralEnsembleRegressor( num_features=1, periodicities=10, model_dir=model_dir, dtype=dtype, - config=_SeedRunConfig()) + config=_SeedRunConfig(), + exogenous_feature_columns=exogenous_feature_columns) self._fit_restore_fit_test_template(_estimator_fn, dtype=dtype) diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py index d5dce30fda0353bd70f44ec567ac91acce1e9394..5f7e3da2db6da26f50aad9d500959238063a3e3c 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py @@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel): batch_end_values = array_ops.squeeze( array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0], [-1, 1, -1]), - squeeze_dims=[1, 2]) + axis=[1, 2]) # A pretty odd but easy to think about loss: L1 loss on the batch end # values. loss = math_ops.reduce_sum( diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py index 1fcd3e391b63c2362d6187da9556e2c71836dbaa..a614386121e000961bf8b32625a28e1251654320 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py +++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py @@ -170,7 +170,7 @@ class KalmanFilter(object): math_ops.matmul( transition_matrices, prior_state[..., None]), - squeeze_dims=[-1]) + axis=[-1]) return advanced_state def predict_state_var( @@ -254,7 +254,7 @@ class KalmanFilter(object): kalman_gain_transposed, array_ops.expand_dims(residual, -1), adjoint_a=True), - squeeze_dims=[-1]) + axis=[-1]) gain_obs = math_ops.matmul( kalman_gain_transposed, observation_model, adjoint_a=True) identity_extradim = linalg_ops.eye( @@ -332,7 +332,7 @@ class KalmanFilter(object): array_ops.expand_dims(state_mean, 1), observation_model, adjoint_b=True), - squeeze_dims=[1]) + axis=[1]) observed_var = math_ops.matmul( math_ops.matmul(observation_model, state_var), observation_model, diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 9646d15486ef618f206936ce55a5eb6ca0387e41..eac210418b57eadc2e99ea0c29690236c3a09516 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -162,6 +162,7 @@ py_library( "python/tpu/__init__.py", "python/tpu/bfloat16.py", "python/tpu/device_assignment.py", + "python/tpu/keras_support.py", "python/tpu/topology.py", "python/tpu/tpu.py", "python/tpu/tpu_feed.py", diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc index 5900c61a38726551391c212f92b9b9eacd4a465b..b05c76ca64fbaedc205ab06cc31616787ccc84b8 100644 --- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc +++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc @@ -26,6 +26,7 @@ REGISTER_OP("OutfeedEnqueue") .Input("input: dtype") .Attr("dtype: type") .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( An op which emits a single Tensor value from an XLA computation. @@ -36,6 +37,7 @@ REGISTER_OP("OutfeedEnqueueTuple") .Input("inputs: dtypes") .Attr("dtypes: list(type)") .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( An op which emits multiple Tensor values from an XLA computation. diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc index a5358842630bed15ad4f0b71ec2d4042f3223ca1..816897499b7a49365060c026d60b977990f3ecdc 100644 --- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc +++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc @@ -41,7 +41,7 @@ namespace tensorflow { namespace tpu { namespace { -using ::tensorflow::grpc::TPUProfileAnalysis; +using ::tensorflow::TPUProfileAnalysis; using ::tensorflow::TPUProfiler; constexpr uint64 kMaxEvents = 1000000; @@ -137,9 +137,9 @@ bool NewSession(const string& service_addr, PopulateProfileRequest(duration_ms, repository_root, session_id, opts); new_session_request.set_repository_root(repository_root); new_session_request.set_session_id(session_id); - std::copy( - hostnames.begin(), hostnames.end(), - proto2::RepeatedFieldBackInserter(new_session_request.mutable_hosts())); + for (const auto& hostname : hostnames) { + new_session_request.add_hosts(hostname); + } ::grpc::ClientContext context; ::grpc::ChannelArguments channel_args; @@ -159,8 +159,8 @@ bool NewSession(const string& service_addr, TF_QCHECK_OK(FromGrpcStatus( stub->NewSession(&context, new_session_request, &new_session_response))); - std::cout << "Profile session succeed for hosts:" - << str_util::Join(hostnames, ","); + std::cout << "Profile session succeed for host(s):" + << str_util::Join(hostnames, ",") << std::endl; return new_session_response.empty_trace(); } diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py index 0b78cf8695091daf797bcb80586397e7ab1c6284..508c7a842fb82ec080082d7e7f02f8d2f2a79447 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py @@ -37,12 +37,17 @@ flags.DEFINE_string( 'will attempt to automatically detect the GCE project from metadata.') flags.DEFINE_string('tpu_name', None, 'Name of the Cloud TPU for Cluster Resolvers. You must ' - 'specify either this flag or --master.') + 'specify either this flag or --service_addr.') # Tool specific parameters flags.DEFINE_string( 'service_addr', None, 'Address of TPU profiler service e.g. ' 'localhost:8466, you must specify either this flag or --tpu_name.') +flags.DEFINE_string( + 'workers_list', None, 'The list of worker TPUs that we are about to profile' + ' e.g. 10.0.1.2, 10.0.1.3. You can specify this flag with --tpu_name or ' + '--service_addr to profile a subset of tpu nodes. You can also use only' + '--tpu_name and leave this flag unspecified to profile all the tpus.') flags.DEFINE_string('logdir', None, 'Path of TensorBoard log directory e.g. /tmp/tb_log, ' 'gs://tb_bucket') @@ -56,18 +61,25 @@ flags.DEFINE_boolean('include_dataset_ops', True, FLAGS = flags.FLAGS EXECUTABLE = 'data/capture_tpu_profile' +JOB_NAME = 'worker' +def get_workers_list(cluster_resolver): + cluster_spec = cluster_resolver.cluster_spec() + task_indices = cluster_spec.task_indices(JOB_NAME) + workers_list = [cluster_spec.task_address(JOB_NAME, i).split(':')[0] + for i in task_indices] + return ','.join(workers_list) def run_main(): tf.app.run(main) - def main(unused_argv=None): tf.logging.set_verbosity(tf.logging.INFO) if FLAGS.service_addr is None and FLAGS.tpu_name is None: sys.exit('You must specify either --service_addr or --tpu_name.') + tpu_cluster_resolver = None if FLAGS.service_addr is not None: if FLAGS.tpu_name is not None: tf.logging.warn('Both --service_addr and --tpu_name are set. Ignoring ' @@ -82,6 +94,12 @@ def main(unused_argv=None): service_addr = tpu_cluster_resolver.get_master() service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466') + workers_list = "" + if FLAGS.workers_list is not None: + workers_list = FLAGS.workers_list + elif tpu_cluster_resolver is not None: + workers_list = get_workers_list(tpu_cluster_resolver) + if not FLAGS.logdir: sys.exit('logdir must be provided.') executable_path = os.path.join(os.path.dirname(__file__), EXECUTABLE) @@ -89,6 +107,7 @@ def main(unused_argv=None): cmd = [executable_path] cmd.append('--logdir=' + logdir) cmd.append('--service_addr=' + service_addr) + cmd.append('--workers_list=' + workers_list) cmd.append('--duration_ms=' + str(FLAGS.duration_ms)) cmd.append('--num_tracing_attempts=' + str(FLAGS.num_tracing_attempts)) cmd.append('--include_dataset_ops=' + str(FLAGS.include_dataset_ops).lower()) diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py index 8d99835b64152629c66607e6792495eb36319eb8..ebd478fd02295108b9d2454963eb06165828b523 100644 --- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py +++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py @@ -20,7 +20,7 @@ from __future__ import print_function from setuptools import setup -_VERSION = '1.6.0-rc1' +_VERSION = '1.6.0' CONSOLE_SCRIPTS = [ 'capture_tpu_profile=cloud_tpu_profiler.main:run_main', diff --git a/tensorflow/contrib/tpu/profiler/version.h b/tensorflow/contrib/tpu/profiler/version.h index dc6a934891138018d32d511750120453bdf290cf..618479e1a6ccf26a4103ea1f182b662d7d9998da 100644 --- a/tensorflow/contrib/tpu/profiler/version.h +++ b/tensorflow/contrib/tpu/profiler/version.h @@ -16,6 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ #define TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ -#define TPU_PROFILER_VERSION "1.5.0" +#define TPU_PROFILER_VERSION "1.6.0" #endif // TENSORFLOW_CONTRIB_TPU_PROFILER_VERSION_H_ diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py index 465c668fd8b42f150892f8e4b52de76c6fe13fa9..2e472a2805f98b15505f56af403aa6223e28c667 100644 --- a/tensorflow/contrib/tpu/python/tpu/datasets.py +++ b/tensorflow/contrib/tpu/python/tpu/datasets.py @@ -170,7 +170,7 @@ def StreamingFilesDataset(files, args=[source_handle], Tout=[dtypes.string], f=LoadingFunc, - target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job) + target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)[0] with ops.device('/job:%s' % worker_job): output_dataset = dataset_ops.Dataset.range(2).repeat().map( diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py new file mode 100644 index 0000000000000000000000000000000000000000..e86ca0a1d8f15ea77046b4e177e04f494ced55e9 --- /dev/null +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -0,0 +1,391 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""*Experimental* support for running Keras models on the TPU. + +To use, wrap your model with the `keras_support.tpu_model` function. + +Example usage: + +``` +# Must activate before building TPU models +keras_support.setup_tpu_session(master_address) + +image = tf.keras.layers.Input(shape=(28, 28, 3), name='image') +c1 = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3))( image) +flattened = tf.keras.layers.Flatten()(c1) +logits = tf.keras.layers.Dense(10, activation='softmax')(flattened) +model = tf.keras.Model(inputs=[image], outputs=[logits]) +model = keras_support.tpu_model(model) + +# Only TF optimizers are currently supported. +model.compile(optimizer=tf.train.AdamOptimizer(), ...) + +# `images` and `labels` should be Numpy arrays. Support for tensor input +# (e.g. datasets) is planned. +model.fit(images, labels) + +# Invoke before shutting down +keras_support.shutdown_tpu_session() +``` +""" + +# pylint: disable=protected-access + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re + +from tensorflow.contrib.framework.python.framework import experimental +from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session as tf_session +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_spec +from tensorflow.python.keras._impl.keras import backend as K +from tensorflow.python.keras._impl.keras import layers +from tensorflow.python.keras._impl.keras import models +from tensorflow.python.keras._impl.keras import optimizers as keras_optimizers +from tensorflow.python.keras._impl.keras.layers import embeddings +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import training_util + + +class TPUEmbedding(embeddings.Embedding): + """TPU compatible embedding layer. + + The default Keras layer is not TPU compatible. This layer is a drop-in + replacement: it has the same behavior and will work on CPU and GPU devices. + """ + + def __init__(self, *args, **kw): + super(TPUEmbedding, self).__init__(*args, **kw) + + def build(self, input_shape): + if input_shape[0] is None: + raise ValueError( + 'TPUEmbeddings must have a fixed input_length or input shape.') + return super(TPUEmbedding, self).build(input_shape) + + def call(self, inputs): + if K.dtype(inputs) != 'int32': + inputs = math_ops.cast(inputs, 'int32') + + inputs = array_ops.one_hot(inputs, self.input_dim) + return math_ops.tensordot(inputs, self.embeddings, 1) + + +class CompiledTPUOp( + collections.namedtuple( + 'CompiledTPUOp', + ['tpu_execute_op', 'infeed_tensors', 'infeed_op', 'outfeed_op'])): + pass + + +def _valid_name(tensor_name): + """Return a valid tensor name (strips '/', ':', etc).""" + return re.sub('[^a-zA-Z0-9_-]+', '', tensor_name) + + +class TPUFunction(object): + """K.function compatible interface for invoking a TPU compiled function. + + Recompilation is triggered on-demand for each set of new inputs shapes: the + results are cached for future execution. We expect most computations will + be dominated by a standard batch-size, followed by a straggler batch for + the end of training or evaluation. + + All `inputs` and `outputs` will be loaded via the infeed and outfeed queues + instead of being injected as `feed_dict` items or fetches. + """ + + def __init__(self, model, execution_mode): + self.model = model + self.execution_mode = execution_mode + self._compilation_cache = {} + + def _specialize_model(self, input_specs): + """Specialize `self.model` (a Keras model) for the given input shapes.""" + # Re-create our input and output layers inside our subgraph. They will be + # attached to the true computation when we clone our model in `tpu_fn`. + K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN) + + # functools.partial and callable objects are not supported by tpu.rewrite + def _model_fn(): + """Compute fit/eval/predict for the TPU.""" + is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN + is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL + is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT + + # During train/eval, we infeed our features as well as labels. + if is_training or is_test: + infeed_layers = self.model._input_layers + self.model._output_layers + else: + infeed_layers = self.model._input_layers + + # Generate our infeed operation to read features & labels. + infeed_tensors = tpu_ops.infeed_dequeue_tuple( + dtypes=[spec.dtype for spec in input_specs], + shapes=[spec.shape for spec in input_specs], + name='infeed-%s' % self.execution_mode) + + assert len(infeed_tensors) == len(infeed_layers), ( + 'Infeed inputs did not match model: %s vs %s', (infeed_layers, + infeed_tensors)) + + tpu_targets = [] + tpu_inputs = [] + + # Sort infeed outputs into inputs and labels for calling our Keras model. + for tensor, layer in zip(infeed_tensors, infeed_layers): + if layer in self.model._input_layers: + tpu_inputs.append(layers.Input(name=layer.name, tensor=tensor)) + if layer in self.model._output_layers: + tpu_targets.append(tensor) + + optimizer = self.model.optimizer + optimizer.iterations = training_util.get_or_create_global_step() + + # Call our model with our infeed inputs (re-using the weights). + model_outputs = self.model(tpu_inputs) + child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs) + if is_training or is_test: + child_model.compile( + optimizer=self.model.optimizer, + loss=self.model.loss, + loss_weights=self.model.loss_weights, + metrics=self.model.metrics, + weighted_metrics=self.model.weighted_metrics, + target_tensors=tpu_targets, + ) + + # Compute our outfeed depending on the execution mode + if is_training: + child_model._make_train_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.train_function.outputs + ] + return [ + child_model.train_function.updates_op, + tpu_ops.outfeed_enqueue_tuple( + child_model.train_function.outputs, name='oufeed-enqueue-train') + ] + elif is_test: + child_model._make_test_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.test_function.outputs + ] + return [ + tpu_ops.outfeed_enqueue_tuple( + child_model.test_function.outputs, name='outfeed-enqueue-test') + ] + elif is_predict: + child_model._make_predict_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.predict_function.outputs + ] + return [ + tpu_ops.outfeed_enqueue_tuple( + child_model.predict_function.outputs, + name='outfeed-enqueue-predict', + ) + ] + else: + assert False, 'Unexpected execution mode: %s' % self.execution_mode + + # Capture outfeed metadata computed during the rewrite. + self._outfeed_spec = None + + tpu_execute_op = tpu.rewrite(_model_fn) + + K._initialize_variables(K.get_session()) # pylint-disable: protected-access + + # Generate CPU side operations to enqueue features/labels and dequeue + # outputs from the model call. + with ops.device('/device:TPU:0'): + infeed_tensors = [] + for spec in input_specs: + infeed_tensors.append( + array_ops.placeholder( + dtype=spec.dtype, + shape=spec.shape, + name='infeed-enqueue-%s' % spec.name)) + + infeed_op = tpu_ops.infeed_enqueue_tuple( + infeed_tensors, [spec.shape for spec in input_specs], + name='infeed-enqueue-%s' % self.execution_mode) + + outfeed_op = tpu_ops.outfeed_dequeue_tuple( + dtypes=[spec.dtype for spec in self._outfeed_spec], + shapes=[spec.shape for spec in self._outfeed_spec], + name='outfeed-dequeue-%s' % self.execution_mode) + + return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op, outfeed_op) + + def __call__(self, inputs): + assert isinstance(inputs, list) + + # Strip sample weight from inputs + if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or + self.execution_mode == model_fn_lib.ModeKeys.EVAL): + input_tensors = self.model._feed_inputs + self.model._feed_targets + inputs = inputs[:len(input_tensors)] + else: + input_tensors = self.model._feed_inputs + + # Compute an input specification (used to generate infeed enqueue and + # dequeue operations). We use the shape from our input array and the + # dtype from our model. A user may pass in a float64 for a float32 + # input: for model compatibility we still must generate a float32 infeed. + input_specs = [] + for tensor, ary in zip(input_tensors, inputs): + input_specs.append( + tensor_spec.TensorSpec(ary.shape, tensor.dtype, + _valid_name(tensor.name))) + + # XLA requires every operation in the graph has a fixed shape. To + # handle varying batch sizes we recompile a new sub-graph for each + # unique input shape. + shape_key = tuple([tuple(spec.shape.as_list()) for spec in input_specs]) + + if shape_key not in self._compilation_cache: + logging.info('New input shapes; (re-)compiling: mode=%s, %s', + self.execution_mode, input_specs) + self._compilation_cache[shape_key] = self._specialize_model(input_specs) + + compiled_model = self._compilation_cache[shape_key] + + infeed_dict = {} + for tensor, value in zip(compiled_model.infeed_tensors, inputs): + infeed_dict[tensor] = value + + session = K.get_session() + _, _, outfeed_outputs = session.run([ + compiled_model.infeed_op, compiled_model.tpu_execute_op, + compiled_model.outfeed_op + ], infeed_dict) + + return outfeed_outputs + + +@experimental +def setup_tpu_session(master): + """Initializes and returns a Keras/TF session connected the TPU `master`.""" + session = tf_session.Session( + target=master, config=config_pb2.ConfigProto(isolate_session_state=True)) + K.set_session(session) + K.get_session().run(tpu.initialize_system()) + K.manual_variable_initialization(True) + return session + + +@experimental +def shutdown_tpu_session(session=None): + """Shutdown the TPU attached to session. + + This should be called to cleanly shut down the TPU system before the client + exits. + + Args: + session: Session to shutdown, or None to use the default session. + + Returns: + + """ + if session is None: + session = K.get_session() + + session.run(tpu.shutdown_system()) + + +class KerasTPUModel(models.Model): + """TPU compatible Keras model wrapper.""" + + def __init__(self, inputs, outputs, name=None): + super(models.Model, self).__init__( + inputs=inputs, + outputs=outputs, + name=name, + ) + self.predict_function = None + self.test_function = None + self.train_function = None + + def compile(self, + optimizer, + loss=None, + metrics=None, + loss_weights=None, + sample_weight_mode=None, + weighted_metrics=None, + target_tensors=None, + **kwargs): + if sample_weight_mode: + raise ValueError('sample_weight_mode not supported for TPU execution.') + if weighted_metrics: + raise ValueError('weighted_metrics not supported for TPU execution.') + if target_tensors: + raise ValueError('target_tensors is not supported for TPU execution.') + + super(KerasTPUModel, self).compile(optimizer, loss, metrics, loss_weights, + sample_weight_mode, weighted_metrics, + target_tensors, **kwargs) + + # Keras optimizers are not compatible with TPU rewrite + if not isinstance(self.optimizer, keras_optimizers.TFOptimizer): + raise ValueError( + 'Optimizer must be a TFOptimizer, got: %s' % self.optimizer) + + def train_on_batch(self, x, y, sample_weight=None, class_weight=None): + return super(KerasTPUModel, self).train_on_batch(x, y, sample_weight, + class_weight) + + def _make_train_function(self): + if not self.train_function: + self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN) + + return self.train_function + + def _make_test_function(self): + if not self.test_function: + self.test_function = TPUFunction(self, model_fn_lib.ModeKeys.EVAL) + return self.test_function + + def _make_predict_function(self): + if not self.predict_function: + self.predict_function = TPUFunction(self, model_fn_lib.ModeKeys.PREDICT) + return self.predict_function + + def cpu_model(self): + return models.Model( + inputs=self.inputs, + outputs=self.outputs, + name=self.name, + ) + + +@experimental +def tpu_model(model): + return KerasTPUModel( + inputs=model.inputs, outputs=model.outputs, name=model.name) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index a1690dadffe5770af9416a7c5ad3a7e336f6bc18..7b8786304ccf7e707712f178838cafb8346d2941 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -173,36 +173,18 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): # gradients, and put the gradient of X in cluster # 'root_cluster.gradient_uid'. # - # When the gradient code adds multiple Ops, it asks them to - # be colocated either with the original Op X, or with one of - # the preceding Ops that was added to the gradient. In other - # words, we want to detect the case where we are colocating - # with an Op that is in cluster root_cluster.gradient_uid - # and put the new Op in that same cluster if the - # gradient_uid is the same (the case that we are in the same - # invocation of gradients, and just adding new Ops to the - # cluster); and in a different cluster if the gradient_uids - # are different (the case that we are in a new invocation of - # gradients, taking the gradient of a previously-computed - # gradient). + # When taking a gradient of a gradient, some ops will be + # colocated with Op in the forward pass (e.g., cluster + # root_cluster) and some in the backward pass (e.g., cluster + # root_cluster.initial_gradient_uid). We need all of the + # grad-of-grad ops to be in the same cluster to avoid cyclic + # dependencies between clusters. We adopt a heuristic that + # puts any op clustered with root_cluster. in + # root_cluster.gradient_uid, even if xxx was + # initial_gradient_uid. self._in_gradient_colocation = op parts = outside_attr.split(".") - if len(parts) > 1: - uid = parts[-1] - if uid == gradient_uid: - # Keep using the same cluster - cluster = outside_attr - else: - # We're taking the gradient of a gradient so make a new - # cluster attr, adding a new '.uid' on the end to - # preserve the invariant that the gradient_uid is the - # suffix after the last '.' in the attr. - cluster = outside_attr + "." + gradient_uid - else: - # We're taking the gradient of an Op in the forward pass, so - # make a new cluster combining the Op's cluster and the - # gradient id. - cluster = outside_attr + "." + gradient_uid + cluster = parts[0] + "." + gradient_uid self._EnterOutsideCompilationScope(cluster=cluster) except ValueError: # The attr was not present: do nothing. diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index cc1a7fd801506e3f0b758c4848205f1c375403d2..6d7331e3c79ade9c12c15de79f550cf3973c4e6c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -210,8 +210,9 @@ class RunConfig(run_config_lib.RunConfig): raise ValueError( 'You cannot provide a ClusterResolver and ' 'session_config.cluster_def.') - self._session_config.cluster_def.CopyFrom( - self._cluster_spec.as_cluster_def()) + if self._cluster_spec: + self._session_config.cluster_def.CopyFrom( + self._cluster_spec.as_cluster_def()) @property def evaluation_master(self): diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py index 7fab19afeecc258c5185f219da2a11f3ffdad056..98eb0e240f0666b2d4a1b5135faef383e49b7468 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py @@ -2054,6 +2054,16 @@ class TPUEstimator(estimator_lib.Estimator): }, every_n_secs=30) ] + input_hooks + chief_hooks = [] + if (self._config.save_checkpoints_secs or + self._config.save_checkpoints_steps): + chief_hooks.append( + training.CheckpointSaverHook( + self.model_dir, + save_secs=self._config.save_checkpoints_secs, + save_steps=self._config.save_checkpoints_steps, + steps_per_run=self._config.tpu_config.iterations_per_loop, + scaffold=scaffold)) summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) with ops.control_dependencies([loss]): update_ops = _sync_variables_ops() @@ -2067,6 +2077,7 @@ class TPUEstimator(estimator_lib.Estimator): return model_fn_lib.EstimatorSpec( mode, loss=loss, + training_chief_hooks=chief_hooks, training_hooks=hooks, train_op=train_op, scaffold=scaffold) diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD index 4d2bfd3e434e60b3fac408931688e8e486b7e494..5de55b5f7f2a41ac6edd27e5a102e565f33df12c 100644 --- a/tensorflow/contrib/training/BUILD +++ b/tensorflow/contrib/training/BUILD @@ -60,6 +60,7 @@ py_library( "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/data", "//tensorflow/python/estimator:inputs_queues", "//third_party/py/numpy", "@six_archive//:six", diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py index da2de3e421b841937e4125168ea1ecea066ff841..edd71fb2502cf6c965a97485e074d20f876fd504 100644 --- a/tensorflow/contrib/training/__init__.py +++ b/tensorflow/contrib/training/__init__.py @@ -57,6 +57,8 @@ from tensorflow.contrib.training.python.training.hparam import * from tensorflow.contrib.training.python.training.resample import * from tensorflow.contrib.training.python.training.sampling_ops import * from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import * +from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset +from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset from tensorflow.contrib.training.python.training.training import add_gradients_summaries from tensorflow.contrib.training.python.training.training import clip_gradient_norms from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn @@ -75,6 +77,7 @@ _allowed_symbols = [ 'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook', 'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries', 'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op', - 'multiply_gradients', 'train'] + 'multiply_gradients', 'enqueue_in_queue_dataset', + 'prepend_from_queue_and_padded_batch_dataset', 'train'] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 823893d02cd07effc4b406ecaa2c01881a7f62ba..acca47e9a3569cdac692349c142e7c51fa98dcae 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -145,6 +145,7 @@ load( "if_static", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library") load( "//third_party/mkl:build_defs.bzl", "if_mkl", @@ -161,7 +162,7 @@ exports_files(["ops/ops.pbtxt"]) # Note that some protos are in neither additional_core_proto_srcs nor this # filegroup; e.g. ones with individual proto_library targets. # LINT.IfChange -CORE_PROTO_SRCS = [ +COMMON_PROTO_SRCS = [ "example/example.proto", "example/feature.proto", "framework/allocation_description.proto", @@ -189,7 +190,6 @@ CORE_PROTO_SRCS = [ "framework/types.proto", "framework/variable.proto", "framework/versions.proto", - "lib/core/error_codes.proto", "protobuf/config.proto", "protobuf/cluster.proto", "protobuf/debug.proto", @@ -202,8 +202,14 @@ CORE_PROTO_SRCS = [ "util/memmapped_file_system.proto", "util/saved_tensor_slice.proto", ] + +ERROR_CODES_PROTO_SRCS = [ + "lib/core/error_codes.proto", +] # LINT.ThenChange(//tensorflow/core/android_proto_config.asciipb) +CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS + # Protos which are not needed on mobile builds, but should be included in # protos_all. # @@ -224,12 +230,16 @@ ADDITIONAL_CORE_PROTO_SRCS = [ tf_proto_library( name = "protos_all", - srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, + srcs = [], cc_api_version = 2, default_header = True, j2objc_api_version = 1, java_api_version = 2, js_api_version = 2, + protodeps = [ + ":protos_all_proto", + ":error_codes_proto", + ], visibility = ["//visibility:public"], ) @@ -257,12 +267,6 @@ proto_library( visibility = ["//visibility:public"], ) -closure_proto_library( - name = "example_protos_closure", - visibility = ["//visibility:public"], - deps = [":example_protos"], -) - exports_files([ "framework/types.proto", ]) @@ -287,7 +291,7 @@ PLATFORM_BASE_HDRS = [ "platform/logging.h", "platform/macros.h", "platform/types.h", - "platform/cpu_info.h", + "platform/byte_order.h", ] PLATFORM_OTHER_HDRS = [ @@ -295,6 +299,7 @@ PLATFORM_OTHER_HDRS = [ "platform/stacktrace.h", "platform/stacktrace_handler.h", "platform/context.h", + "platform/cpu_info.h", "platform/cpu_feature_guard.h", "platform/dynamic_annotations.h", "platform/env.h", @@ -323,7 +328,6 @@ cc_library( srcs = glob([ "platform/*/integral_types.h", "platform/*/logging.h", - "platform/*/cpu_info.h", ]), hdrs = PLATFORM_BASE_HDRS, deps = [ @@ -353,7 +357,9 @@ cc_library( "lib/bfloat16/bfloat16.h", ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()), copts = tf_copts(), - deps = tf_lib_proto_parsing_deps(), + deps = tf_lib_proto_parsing_deps() + [ + "@double_conversion//:double-conversion", + ], ) # This build rule (along with :lib_internal, :framework, and @@ -540,6 +546,7 @@ tf_cuda_library( "framework/device_base.h", "framework/function.h", "framework/graph_def_util.h", + "framework/graph_to_functiondef.h", "framework/kernel_def_builder.h", "framework/log_memory.h", "framework/lookup_interface.h", @@ -563,6 +570,7 @@ tf_cuda_library( "framework/selective_registration.h", "framework/session_state.h", "framework/shape_inference.h", + "framework/stats_aggregator.h", "framework/tensor.h", "framework/tensor_shape.h", "framework/tensor_slice.h", @@ -671,6 +679,7 @@ cc_library( "framework/tensor_types.h", "framework/type_traits.h", "lib/bfloat16/bfloat16.h", + "platform/byte_order.h", "platform/default/dynamic_annotations.h", "platform/default/integral_types.h", "platform/default/logging.h", @@ -992,6 +1001,7 @@ cc_library( "//tensorflow/core/kernels:nn", "//tensorflow/core/kernels:parameterized_truncated_normal_op", "//tensorflow/core/kernels:parsing", + "//tensorflow/core/kernels:partitioned_function_ops", "//tensorflow/core/kernels:random_ops", "//tensorflow/core/kernels:random_poisson_op", "//tensorflow/core/kernels:remote_fused_graph_ops", @@ -1136,7 +1146,8 @@ filegroup( filegroup( name = "mobile_srcs_no_runtime", srcs = [ - ":proto_text_srcs_all", + ":protos_all_proto_text_srcs", + ":error_codes_proto_text_srcs", "//tensorflow/core/platform/default/build_config:android_srcs", ] + glob( [ @@ -1246,6 +1257,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1285,6 +1297,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1348,6 +1361,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1370,6 +1384,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1617,6 +1632,18 @@ tf_proto_library_cc( ], ) +tf_proto_library_cc( + name = "eager_service_proto", + srcs = ["protobuf/eager_service.proto"], + has_services = 1, + cc_api_version = 2, + cc_stubby_versions = ["2"], + protodeps = tf_additional_all_protos(), + visibility = [ + "//tensorflow:internal", + ], +) + LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob( [ "lib/**/*.h", @@ -1766,6 +1793,7 @@ cc_library( "//tensorflow/core/platform/default/build_config:platformlib", "@snappy", "@zlib_archive//:zlib", + "@double_conversion//:double-conversion", "@protobuf_archive//:protobuf", ] + tf_protos_all_impl() + tf_protos_grappler_impl(), ) @@ -1911,6 +1939,7 @@ cc_library( "lib/core/casts.h", "lib/core/stringpiece.h", "lib/png/png_io.h", + "platform/byte_order.h", "platform/cpu_info.h", "platform/default/integral_types.h", "platform/default/logging.h", @@ -1926,15 +1955,58 @@ cc_library( ], ) -proto_text_hdrs_and_srcs = tf_generate_proto_text_sources( - name = "proto_text_srcs_all", - srcs = CORE_PROTO_SRCS, +tf_proto_library( + name = "error_codes_proto", + srcs = ERROR_CODES_PROTO_SRCS, + cc_api_version = 2, + default_header = True, + j2objc_api_version = 1, + java_api_version = 2, + js_api_version = 2, +) + +tf_generate_proto_text_sources( + name = "error_codes_proto_text", + srcs = ERROR_CODES_PROTO_SRCS, + protodeps = [], + srcs_relative_dir = "tensorflow/core/", + deps = [ + ":error_codes_proto_cc", + ":lib_internal", + ], +) + +tf_proto_library( + name = "protos_all_proto", + srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, + cc_api_version = 2, + default_header = True, + j2objc_api_version = 1, + java_api_version = 2, + js_api_version = 2, + protodeps = [ + ":error_codes_proto", + ], +) + +tf_generate_proto_text_sources( + name = "protos_all_proto_text", + srcs = COMMON_PROTO_SRCS, + protodeps = ERROR_CODES_PROTO_SRCS, srcs_relative_dir = "tensorflow/core/", + deps = [ + ":error_codes_proto_text", + ":lib_internal", + ":protos_all_proto_cc", + ], ) cc_library( name = "proto_text", - hdrs = proto_text_hdrs_and_srcs.hdrs, + hdrs = [ + ":error_codes_proto_text_hdrs", + ":protos_all_proto_text_hdrs", + ], deps = [ ":lib", ":lib_internal", @@ -2079,7 +2151,7 @@ tf_cuda_library( "util/memmapped_file_system.cc", "util/memmapped_file_system_writer.cc", ], - }) + proto_text_hdrs_and_srcs.srcs + tf_additional_framework_srcs(), + }) + tf_additional_framework_srcs(), hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS, copts = tf_copts(), linkopts = select({ @@ -2093,7 +2165,8 @@ tf_cuda_library( deps = [ ":lib", ":lib_internal", - ":proto_text", + ":protos_all_proto_text", + ":error_codes_proto_text", ":protos_all_cc", ":version_lib", "//tensorflow/core/platform/default/build_config:platformlib", @@ -2271,6 +2344,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [ "common_runtime/allocator_retry.h", "common_runtime/base_collective_executor.h", "common_runtime/bfc_allocator.h", + "common_runtime/broadcaster.h", "common_runtime/buf_rendezvous.h", "common_runtime/build_graph_options.h", "common_runtime/collective_executor_mgr.h", @@ -2318,6 +2392,7 @@ tf_cuda_library( "common_runtime/allocator_retry.cc", "common_runtime/base_collective_executor.cc", "common_runtime/bfc_allocator.cc", + "common_runtime/broadcaster.cc", "common_runtime/buf_rendezvous.cc", "common_runtime/build_graph_options.cc", "common_runtime/collective_executor_mgr.cc", @@ -2503,6 +2578,19 @@ tf_cuda_library( cc_library( name = "gpu_id", + hdrs = [ + "common_runtime/gpu/gpu_id.h", + "common_runtime/gpu/gpu_id_manager.h", + ], + deps = [ + ":lib", + ] + if_static([ + ":gpu_id_impl", + ]), +) + +cc_library( + name = "gpu_id_impl", srcs = ["common_runtime/gpu/gpu_id_manager.cc"], hdrs = [ "common_runtime/gpu/gpu_id.h", @@ -2552,7 +2640,7 @@ tf_cuda_library( ":core_cpu_lib", ":framework", ":framework_internal", - ":gpu_id", + ":gpu_id_impl", ":gpu_init_impl", ":gpu_lib", ":graph", @@ -2775,7 +2863,6 @@ tf_cc_tests( "lib/monitoring/sampler_test.cc", "lib/random/distribution_sampler_test.cc", "lib/random/philox_random_test.cc", - "lib/random/random_distributions_test.cc", "lib/random/random_test.cc", "lib/random/simple_philox_test.cc", "lib/strings/base64_test.cc", @@ -2805,6 +2892,21 @@ tf_cc_tests( ], ) +tf_cc_test( + name = "lib_random_random_distributions_test", + srcs = ["lib/random/random_distributions_test.cc"], + tags = ["optonly"], + deps = [ + ":lib", + ":lib_internal", + ":lib_test_internal", + ":protos_all_cc", + ":test", + ":test_main", + "//third_party/eigen3", + ], +) + tf_cc_test( name = "platform_env_test", size = "small", @@ -2974,6 +3076,7 @@ tf_cc_tests( "framework/common_shape_fns_test.cc", "framework/function_test.cc", "framework/graph_def_util_test.cc", + "framework/graph_to_functiondef_test.cc", "framework/kernel_def_builder_test.cc", "framework/memory_types_test.cc", "framework/node_def_builder_test.cc", @@ -3052,6 +3155,8 @@ tf_cc_tests( ":testlib", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/cc:sendrecv_ops", "//tensorflow/cc:while_loop", @@ -3141,6 +3246,34 @@ tf_cc_tests_gpu( ], ) +tf_cc_tests_gpu( + name = "broadcaster_test", + size = "small", + srcs = [ + "common_runtime/broadcaster_test.cc", + ], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags(), + deps = [ + ":all_kernels", + ":core", + ":core_cpu", + ":core_cpu_internal", + ":direct_session_internal", + ":framework", + ":framework_internal", + ":gpu_runtime", + ":lib", + ":lib_internal", + ":ops", + ":protos_all_cc", + ":protos_test_cc", + ":test", + ":test_main", + ":testlib", + ], +) + tf_cc_test_mkl( name = "mkl_runtime_tests", size = "small", @@ -4028,3 +4161,9 @@ alias( actual = ":mobile_srcs", visibility = ["//visibility:public"], ) + +closure_proto_library( + name = "example_protos_closure", + visibility = ["//visibility:public"], + deps = [":example_protos"], +) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index c2858a1bfbb5a30e5e72bcfad694f696e08d2346..b90f5473c89cbe3afe38f0283025e7273817d0e4 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,9 +82,9 @@ END } summary: "Update \'*var\' according to the Adam algorithm." description: <