diff --git a/.gitignore b/.gitignore index be75938ec401b1d72fa54773c85191aaac7d7f35..828bbe9bd3363853ae3f58f54a8d5f60cefad837 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,7 @@ Podfile.lock /tensorflow/contrib/lite/examples/ios/simple/data/*.txt /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite xcuserdata/** +/api_init_files_list.txt # Android .gradle diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index ca80db23ed3ccbbdc49c61db6cd03ff735470512..9b86425aa5fbc2be2872b3f5d2809eaa844f9d68 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -1700,7 +1700,7 @@ TEST_F(CApiGradientsTest, OpWithNoGradientRegistered_NoGradInputs) { TestGradientsError(false); } -// REGISTER_OP for CApiTestAttributesTest test cases. +// REGISTER_OP for CApiAttributesTest test cases. // Registers two ops, each with a single attribute called 'v'. // The attribute in one op will have a type 'type', the other // will have list(type). diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3e14c107272cb500e634f58fd4f4007468940947..14321191625e448637aa44a7f6a17820159b97c2 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -31,7 +31,6 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", "//tensorflow/core/common_runtime/eager:execute", - "//tensorflow/core/common_runtime/eager:execute_node", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core/common_runtime/eager:copy_to_device_node", @@ -41,8 +40,6 @@ tf_cuda_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - # TODO(b/74620627): move this here - "//tensorflow/python:cpp_shape_inference_proto_cc", ], }) + select({ "//tensorflow:with_xla_support": [ @@ -51,6 +48,7 @@ tf_cuda_library( ], "//conditions:default": [], }) + [ + "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core:gpu_runtime", ], ) @@ -73,6 +71,7 @@ tf_cuda_library( "//tensorflow/core:lib_internal", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", + "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 369342b142573e21f24ec4ba979288475ae3c816..3bf071f3abaac7dfd4113964fd49cd9322913bd5 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_set.h" #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h" #include "tensorflow/core/common_runtime/eager/execute.h" -#include "tensorflow/core/common_runtime/eager/execute_node.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/framework/node_def_util.h" @@ -49,7 +48,6 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/public/version.h" -#include "tensorflow/python/framework/cpp_shape_inference.pb.h" using tensorflow::int64; using tensorflow::string; @@ -219,9 +217,6 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) { } return retval; } -} // extern "C" - -extern "C" { TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { @@ -241,21 +236,18 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, void TFE_DeleteOp(TFE_Op* op) { delete op; } void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { - tensorflow::Device* d = nullptr; - if (device_name != nullptr && strlen(device_name) > 0) { - status->status = op->ctx->context.FindDeviceByName(device_name, &d); - } - op->device = d; + status->status = op->operation.SetDevice(device_name); } const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { - tensorflow::Device* device = - (op->device == nullptr) ? op->ctx->context.HostCPU() : op->device; + tensorflow::Device* device = (op->operation.Device() == nullptr) + ? op->operation.EagerContext()->HostCPU() + : op->operation.Device(); return device->name().c_str(); } void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { - op->use_xla = enable; + op->operation.SetUseXla(enable); #ifndef TENSORFLOW_EAGER_USE_XLA LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not " "built with XLA support."; @@ -263,22 +255,20 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - h->handle->Ref(); - op->inputs.push_back(h->handle); - op->attrs.NumInputs(op->inputs.size()); + op->operation.AddInput(h->handle); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret; - if (op->is_function()) { + if (op->operation.is_function()) { status->status = tensorflow::errors::Unimplemented( "TODO(apassos): Support for attributes for TensorFlow functions is not " "ready yet."); return TF_ATTR_INT; // The compiler requires that we return something. } - status->status = - tensorflow::AttrTypeByName(*op->attr_types, attr_name, &ret, is_list); + status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(), + attr_name, &ret, is_list); return ret; } @@ -297,23 +287,24 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, } void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const char* value) { - op->attrs.Set(attr_name, value); + op->operation.MutableAttrs()->Set(attr_name, value); } void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) { - op->attrs.Set(attr_name, static_cast(value)); + op->operation.MutableAttrs()->Set(attr_name, static_cast(value)); } void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) { - op->attrs.Set(attr_name, value); + op->operation.MutableAttrs()->Set(attr_name, value); } void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) { - op->attrs.Set(attr_name, (value == 0) ? false : true); + op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true); } void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) { - op->attrs.Set(attr_name, static_cast(value)); + op->operation.MutableAttrs()->Set(attr_name, + static_cast(value)); } void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, @@ -335,23 +326,24 @@ void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, proto.add_dim()->set_size(dims[d]); } } - op->attrs.Set(attr_name, proto); + op->operation.MutableAttrs()->Set(attr_name, proto); } void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, const TFE_Op* value) { tensorflow::AttrValue attr_value; tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(value->name); - value->attrs.FillAttrValueMap(func->mutable_attr()); - op->attrs.Set(attr_name, attr_value); + func->set_name(value->operation.Name()); + value->operation.Attrs().FillAttrValueMap(func->mutable_attr()); + op->operation.MutableAttrs()->Set(attr_name, attr_value); } #define TFE_OP_SET_ATTR_LIST(fn, type) \ void fn(TFE_Op* op, const char* attr_name, const type* values, \ int num_values) { \ - op->attrs.Set(attr_name, tensorflow::gtl::ArraySlice( \ - values, num_values)); \ + op->operation.MutableAttrs()->Set( \ + attr_name, \ + tensorflow::gtl::ArraySlice(values, num_values)); \ } TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrStringList, char*) TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) @@ -359,14 +351,14 @@ TFE_OP_SET_ATTR_LIST(TFE_OpSetAttrFloatList, float) void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, const int64_t* values, int num_values) { - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + reinterpret_cast(values), num_values)); } void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, const TF_DataType* values, int num_values) { - op->attrs.Set( + op->operation.MutableAttrs()->Set( attr_name, tensorflow::gtl::ArraySlice( reinterpret_cast(values), num_values)); @@ -378,8 +370,8 @@ void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, for (int i = 0; i < num_values; ++i) { b[i] = values[i]; } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice(b.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice(b.get(), num_values)); } void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, @@ -409,9 +401,9 @@ void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, } } } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - proto.get(), num_values)); + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + proto.get(), num_values)); } void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, @@ -419,531 +411,25 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, std::unique_ptr funcs( new tensorflow::NameAttrList[num_values]); for (int i = 0; i < num_values; i++) { - funcs[i].set_name(value[i]->name); - value[i]->attrs.FillAttrValueMap(funcs[i].mutable_attr()); - } - op->attrs.Set(attr_name, - tensorflow::gtl::ArraySlice( - funcs.get(), num_values)); -} -} // extern "C" - -namespace { - -// Initializes the step stats if needed. -void MaybeInitializeStepStats(tensorflow::StepStats* step_stats, - tensorflow::EagerContext* ctx) { - // Lazily initialize the RunMetadata with information about all devices if - // this is the first call. - while (step_stats->dev_stats_size() < ctx->devices()->size()) { - int device_idx = step_stats->dev_stats_size(); - auto* dev_stats = step_stats->add_dev_stats(); - dev_stats->set_device(ctx->devices()->at(device_idx)->name()); - } -} - -int StepStatsDeviceIndex(tensorflow::StepStats* step_stats, - tensorflow::EagerContext* ctx, - tensorflow::Device* device) { - // Find the current device's index. - if (device == nullptr) { - device = ctx->HostCPU(); - } - for (int i = 0; i < ctx->devices()->size(); ++i) { - if (ctx->devices()->at(i) == device || - ctx->devices()->at(i)->name() == device->name()) { - return i; - } + funcs[i].set_name(value[i]->operation.Name()); + value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr()); } - // TODO(apassos) do not fall back to host CPU if device is unknown. - return 0; + op->operation.MutableAttrs()->Set( + attr_name, tensorflow::gtl::ArraySlice( + funcs.get(), num_values)); } -tensorflow::Status ValidateInputTypeAndPlacement( - tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op, - const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) { - tensorflow::Device* host_device = ctx->HostCPU(); - const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types(); - if (memtypes.size() != op->inputs.size()) { - return tensorflow::errors::InvalidArgument( - "expected ", memtypes.size(), " inputs, got ", op->inputs.size()); - } - for (int i = 0; i < op->inputs.size(); ++i) { - const tensorflow::Device* expected_device = - memtypes[i] == tensorflow::HOST_MEMORY ? host_device : op_device; - tensorflow::TensorHandle* handle = op->inputs[i]; - tensorflow::Device* handle_device = nullptr; - TF_RETURN_IF_ERROR(handle->Device(&handle_device)); - const tensorflow::Device* actual_device = - handle_device == nullptr ? host_device : handle_device; - if (expected_device != actual_device) { - switch (ctx->GetDevicePlacementPolicy()) { - case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32: - // TODO(xpan): See if we could bubble python related error up - // to python level. - if (handle->dtype == tensorflow::DT_INT32) { - // Note: enabling silent copies of int32 tensors to match behavior - // of graph mode. - break; - } - TF_FALLTHROUGH_INTENDED; - case tensorflow::DEVICE_PLACEMENT_EXPLICIT: - return tensorflow::errors::InvalidArgument( - "Tensors on conflicting devices:" - " cannot compute ", - op->name, " as input #", i, " was expected to be on ", - expected_device->name(), " but is actually on ", - actual_device->name(), " (operation running on ", - op_device->name(), ")", - " Tensors can be copied explicitly using .gpu() or .cpu() " - "methods," - " or transparently copied by using tf.enable_eager_execution(" - "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors " - "between devices" - " may slow down your model"); - case tensorflow::DEVICE_PLACEMENT_WARN: - LOG(WARNING) << "before computing " << op->name << " input #" << i - << " was expected to be on " << expected_device->name() - << " but is actually on " << actual_device->name() - << " (operation running on " << op_device->name() - << "). This triggers a copy which can be a performance " - "bottleneck."; - break; - case tensorflow::DEVICE_PLACEMENT_SILENT: // Do nothing. - break; - } - // We are only here if the policy is warn or silent copies, so we should - // trigger a copy. - auto pre_time = tensorflow::Env::Default()->NowMicros(); - tensorflow::TensorHandle* copied_tensor = nullptr; - tensorflow::Status status = tensorflow::EagerCopyToDevice( - handle, ctx, expected_device->name().c_str(), &copied_tensor); - if (run_metadata != nullptr) { - auto* step_stats = run_metadata->mutable_step_stats(); - MaybeInitializeStepStats(step_stats, ctx); - // Record the sending on the source device for now. - int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device); - auto* dev_stats = step_stats->mutable_dev_stats(device_idx); - auto* node_stats = dev_stats->add_node_stats(); - node_stats->set_node_name("_Send"); - node_stats->set_all_start_micros(pre_time); - node_stats->set_op_end_rel_micros( - tensorflow::Env::Default()->NowMicros() - pre_time); - } - if (!status.ok()) { - if (copied_tensor != nullptr) copied_tensor->Unref(); - return tensorflow::errors::Internal( - "Failed copying input tensor from ", actual_device->name(), " to ", - expected_device->name(), " in order to run ", op->name, ": ", - status.error_message()); - } - handle->Unref(); - handle = copied_tensor; - op->inputs[i] = copied_tensor; - } - if (handle->dtype != kernel->input_type(i)) { - return tensorflow::errors::InvalidArgument( - "cannot compute ", op->name, " as input #", i, - " was expected to be a ", - tensorflow::DataTypeString(kernel->input_type(i)), - " tensor but is a ", tensorflow::DataTypeString(handle->dtype), - " tensor"); - } - } - return tensorflow::Status::OK(); -} - -tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef, - TFE_Context* ctx, TF_Status* status) { - tensorflow::DeviceSet ds; - for (tensorflow::Device* d : *ctx->context.devices()) { - ds.AddDevice(d); - } - tensorflow::DeviceTypeVector final_devices; - status->status = tensorflow::SupportedDeviceTypesForNode( - ds.PrioritizedDeviceTypeList(), ndef, &final_devices); - if (!status->status.ok()) { - return nullptr; - } - if (final_devices.empty()) { - status->status = tensorflow::errors::Internal( - "Could not find valid device for node ", ndef.DebugString()); - return nullptr; - } - for (tensorflow::Device* d : *ctx->context.devices()) { - if (d->device_type() == final_devices[0].type_string()) { - return d; - } - } - status->status = tensorflow::errors::Unknown( - "Could not find a device for node ", ndef.DebugString()); - return nullptr; -} - -#ifdef TENSORFLOW_EAGER_USE_XLA -// Synthesizes and returns a wrapper function over `op`, which must be a -// primitive op (e.g. matmul). -// -// The wrapper function conforms to the function signature expected by -// _XlaLaunchOp, with input params ordered by . For example, if the op has input params , they will be reordered to as the input params to the synthesized function. -// -// It populates `const_input_types`, `arg_input_types` and -// `op_input_to_func_input` based on the reordering results, that the caller can -// use them to build an _XlaLaunchOp. On error, it returns NULL, and sets -// `status` accordingly. -const tensorflow::FunctionDef* OpToFunction( - TFE_Op* op, std::vector* const_input_types, - std::vector* arg_input_types, - tensorflow::gtl::FlatMap* op_input_to_func_input, - TF_Status* status) { - DCHECK(!op->is_function()); - - tensorflow::FunctionDef fdef; - - // Get the OpDef of the op we are trying to encapsulate. - TFE_Context* ctx = op->ctx; - const tensorflow::OpRegistrationData* op_data; - { - status->status = ctx->context.FindFunctionOpData(op->name, &op_data); - if (!status->status.ok()) { - return nullptr; - } - } - const tensorflow::OpDef& op_def = op_data->op_def; - - tensorflow::OpDef* signature = fdef.mutable_signature(); - - // Handle constant inputs. - const std::unordered_set const_inputs( - *tensorflow::XlaOpRegistry::CompileTimeConstantInputs(op->name)); - - // First add place holders for the input args, so that we can refer to them by - // position in the next loop. Also tally up the resource inputs. - int num_resource_inputs = 0; - for (int i = 0; i < op_def.input_arg_size(); ++i) { - if (op_def.input_arg(i).type() == tensorflow::DT_RESOURCE) { - ++num_resource_inputs; - } - signature->add_input_arg(); - } - - // Now we map the input params from `op_def` to `signature`, where the param - // ordering for `signature` is: . - int const_index = 0; - int arg_index = const_inputs.size(); - int resource_index = op_def.input_arg_size() - num_resource_inputs; - for (int i = 0; i < op_def.input_arg_size(); ++i) { - const tensorflow::OpDef::ArgDef& op_input_arg = op_def.input_arg(i); - tensorflow::OpDef::ArgDef* func_input_arg = nullptr; - if (const_inputs.find(op_input_arg.name()) != const_inputs.end()) { - VLOG(1) << "For const input, mapping op input " << i << " to func input " - << const_index; - (*op_input_to_func_input)[i] = const_index; - func_input_arg = signature->mutable_input_arg(const_index++); - const_input_types->push_back( - static_cast(op->inputs[i]->dtype)); - } else if (op_input_arg.type() == tensorflow::DT_RESOURCE) { - VLOG(1) << "For resource input, mapping op input " << i - << " to func input " << resource_index; - (*op_input_to_func_input)[i] = resource_index; - func_input_arg = signature->mutable_input_arg(resource_index++); - } else { - VLOG(1) << "For arg input, mapping op input " << i << " to func input " - << arg_index; - (*op_input_to_func_input)[i] = arg_index; - func_input_arg = signature->mutable_input_arg(arg_index++); - arg_input_types->push_back( - static_cast(op->inputs[i]->dtype)); - } - - func_input_arg->set_name(op_input_arg.name()); - func_input_arg->set_type(op->inputs[i]->dtype); - } - VLOG(1) << "Added OpDef Inputs: " << fdef.DebugString(); - - // Resources args are at the end of the function input params, and we should - // have iterated over all of them. - DCHECK_EQ(signature->input_arg_size(), resource_index); - - // Make the synthesized function's name unique. - signature->set_name(tensorflow::strings::StrCat( - op_def.name(), func_id_generator.fetch_add(1))); - - // Add the node def and set its input names to match op_def's names. - const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef(); - DCHECK_EQ(signature->input_arg_size(), ndef.input_size()); - *fdef.add_node_def() = ndef; - for (int i = 0; i < op_def.input_arg_size(); ++i) { - fdef.mutable_node_def(0)->set_input(i, op_def.input_arg(i).name()); - } - VLOG(1) << "Added NodeDef: " << fdef.DebugString(); - - // Fix the output names and set output types. - for (int i = 0; i < op_def.output_arg_size(); ++i) { - tensorflow::OpDef::ArgDef* arg = signature->add_output_arg(); - const tensorflow::OpDef::ArgDef& op_def_arg = op_def.output_arg(i); - const string& out_tensor_name = tensorflow::strings::StrCat( - ndef.name(), ":", op_def_arg.name(), ":", 0); - arg->set_name(op_def_arg.name()); - (*fdef.mutable_ret())[op_def_arg.name()] = out_tensor_name; - const string& type_attr = op_def_arg.type_attr(); - if (!type_attr.empty()) { - auto i = ndef.attr().find(type_attr); - if (i == ndef.attr().end()) { - status->status = tensorflow::errors::InvalidArgument( - tensorflow::strings::StrCat("Could not find attr ", type_attr, - " in NodeDef ", ndef.DebugString())); - return nullptr; - } - arg->set_type(i->second.type()); - } - } - VLOG(1) << "Fixed Output names and all types: " << fdef.DebugString(); - - status->status = ctx->context.AddFunctionDef(fdef); - if (!status->status.ok()) return nullptr; - const auto ret = ctx->context.FindFunctionDef(signature->name()); - DCHECK(ret != nullptr); - return ret; -} - -// Builds an _XLALaunchOp as a wrapper over 'op', so that 'op' can be executed -// via XLA. -std::unique_ptr BuildXlaLaunch(TFE_Op* op, TF_Status* status) { - VLOG(1) << "Creating _XlaLaunchOp for TFE_Op " << op->name; - auto launch_op = - std::unique_ptr(TFE_NewOp(op->ctx, "_XlaLaunch", status)); - if (TF_GetCode(status) != TF_OK) return nullptr; - if (op->device) { - TFE_OpSetDevice(launch_op.get(), op->device->name().c_str(), status); - if (TF_GetCode(status) != TF_OK) return nullptr; - } - - const tensorflow::FunctionDef* fdef; - { fdef = op->ctx->context.FindFunctionDef(op->name); } - std::vector const_input_types; - std::vector arg_input_types; - tensorflow::gtl::FlatMap op_input_to_func_input; - if (fdef == nullptr) { - // See if this is a primitive op, and if so create a function for it, so - // that _XlaLaunchOp can access it. - fdef = OpToFunction(op, &const_input_types, &arg_input_types, - &op_input_to_func_input, status); - if (!status->status.ok()) return nullptr; - } else { - // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for - // functions, so we need to find another way to handle constant inputs. - for (int i = const_input_types.size(); - i < fdef->signature().input_arg_size(); ++i) { - VLOG(1) << "Adding Targs from input arg " << i; - const tensorflow::OpDef::ArgDef& arg = fdef->signature().input_arg(i); - arg_input_types.push_back(static_cast(arg.type())); - } - } - DCHECK(fdef != nullptr); - - // Copy inputs and their devices. - // Since input param reordering may have occurred between `op` and `launch_op` - // via `op_input_to_func_input`, adjust the actual inputs accordingly. - launch_op->inputs = op->inputs; - for (tensorflow::TensorHandle* h : launch_op->inputs) { - h->Ref(); - } - if (!op_input_to_func_input.empty()) { - DCHECK_EQ(op->inputs.size(), op_input_to_func_input.size()); - for (int i = 0; i < op_input_to_func_input.size(); ++i) { - VLOG(1) << "mapping op input " << i << " to func input " - << op_input_to_func_input[i]; - - launch_op->inputs[op_input_to_func_input[i]] = op->inputs[i]; - } - } - launch_op->attrs.NumInputs(op->inputs.size()); - - TFE_OpSetAttrTypeList(launch_op.get(), "Tconstants", const_input_types.data(), - const_input_types.size()); - - // Set Targs and Nresources attrs. - TFE_OpSetAttrTypeList(launch_op.get(), "Targs", arg_input_types.data(), - arg_input_types.size()); - const int num_resource_inputs = fdef->signature().input_arg_size() - - const_input_types.size() - - arg_input_types.size(); - TFE_OpSetAttrInt(launch_op.get(), "Nresources", num_resource_inputs); - - // Set Tresults attr. - std::vector tresults; - for (const tensorflow::OpDef::ArgDef& arg : fdef->signature().output_arg()) { - tresults.push_back(static_cast(arg.type())); - } - TFE_OpSetAttrTypeList(launch_op.get(), "Tresults", tresults.data(), - tresults.size()); - - // Set function attr. - tensorflow::AttrValue attr_value; - tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(fdef->signature().name()); - launch_op->attrs.Set("function", attr_value); - - return launch_op; -} -#endif // TENSORFLOW_EAGER_USE_XLA - -} // namespace - -extern "C" { - void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { - TFE_Context* ctx = op->ctx; - status->status = ctx->context.GetStatus(); + tensorflow::gtl::InlinedVector handle_retvals( + *num_retvals); + status->status = + tensorflow::EagerExecute(&op->operation, &handle_retvals, num_retvals); if (!status->status.ok()) { return; } -#ifdef TENSORFLOW_EAGER_USE_XLA - std::unique_ptr xla_launch_op; - if (op->use_xla && op->name != "_XlaLaunch") { - xla_launch_op = BuildXlaLaunch(op, status); - if (!status->status.ok()) { - return; - } - op = xla_launch_op.get(); - } -#endif // TENSORFLOW_EAGER_USE_XLA - // Ensure all resource-touching ops run in the device the resource is, - // regardless of anything else that has been specified. This is identical to - // the graph mode behavior. - for (int i = 0; i < op->inputs.size(); ++i) { - tensorflow::Device* input_op_device = nullptr; - status->status = op->inputs[i]->OpDevice(&input_op_device); - if (!status->status.ok()) return; - VLOG(2) << "for op " << op->name << " input " << i << " " - << tensorflow::DataTypeString(op->inputs[i]->dtype) << " " - << (input_op_device == nullptr ? "cpu" : input_op_device->name()) - << " " << (op->device == nullptr ? "cpu" : op->device->name()); - if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE && - (input_op_device != op->device || input_op_device == nullptr)) { - tensorflow::Device* d = - input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device; - VLOG(1) << "Changing device of operation " << op->name << " to " - << d->name() << " because input #" << i - << " is a resource in this device."; - op->device = d; - } - } - tensorflow::Device* device = op->device; - - tensorflow::Fprint128 cache_key = - op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name()); - tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key); - if (kernel == nullptr) { - const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef(); - if (device == nullptr) { - device = SelectDevice(ndef, ctx, status); - if (!status->status.ok()) { - return; - } - } - CHECK(device != nullptr); - if (ctx->context.LogDevicePlacement()) { - LOG(INFO) << "Executing op " << ndef.op() << " in device " - << device->name(); - } - kernel = new tensorflow::KernelAndDevice(ctx->context.GetRendezvous()); - // Knowledge of the implementation of Init (and in-turn - // FunctionLibraryRuntime::CreateKernel) tells us that ctx->func_lib_def - // will be accessed, so grab on to the lock. - // See WARNING comment in Execute (before kernel->Run) - would be nice to - // rework to avoid this subtlety. - tensorflow::tf_shared_lock l(*ctx->context.FunctionsMu()); - status->status = tensorflow::KernelAndDevice::Init( - ndef, ctx->context.func_lib(device), kernel); - if (!status->status.ok()) { - delete kernel; - return; - } - // Update output_dtypes inside `kernel`. - const tensorflow::OpDef* op_def = nullptr; - const tensorflow::FunctionDef* function_def = - ctx->context.FuncLibDef()->Find(ndef.op()); - if (function_def != nullptr) { - op_def = &(function_def->signature()); - } - if (op_def == nullptr) { - status->status = OpDefForOp(ndef.op().c_str(), &op_def); - if (!status->status.ok()) { - return; - } - } - tensorflow::DataTypeVector input_dtypes; - status->status = InOutTypesForNode(ndef, *op_def, &input_dtypes, - kernel->mutable_output_dtypes()); - if (!status->status.ok()) { - return; - } - ctx->context.AddKernelToCache(cache_key, kernel); - } - const tensorflow::DataTypeVector& output_dtypes = kernel->output_dtypes(); - const int output_dtypes_size = output_dtypes.size(); - if (output_dtypes_size > *num_retvals) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat("Expecting ", output_dtypes.size(), - " outputs, but *num_retvals is ", - *num_retvals) - .c_str()); - return; - } - *num_retvals = output_dtypes_size; - if (device == nullptr) { - // TODO(apassos) debug how the assignment below might return a different - // device from the one requested above. - device = kernel->device(); - } - status->status = ValidateInputTypeAndPlacement( - &ctx->context, device, op, kernel->kernel(), - ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto() - : nullptr); - if (!status->status.ok()) return; - std::unique_ptr maybe_stats; - if (ctx->context.ShouldStoreMetadata()) { - maybe_stats.reset(new tensorflow::NodeExecStats); - maybe_stats->set_node_name(op->name); - maybe_stats->set_all_start_micros(tensorflow::Env::Default()->NowMicros()); - maybe_stats->set_op_start_rel_micros(0); - maybe_stats->set_scheduled_micros(tensorflow::Env::Default()->NowMicros()); - // TODO(apassos) track referenced tensors - } - if (ctx->context.Async()) { - // Note that for async mode, execution order will make sure that all - // input handles are ready before executing them. - // TODO(agarwal): Consider executing "cheap" kernels inline for performance. - tensorflow::gtl::InlinedVector handle_retvals( - *num_retvals); - tensorflow::uint64 id = op->ctx->context.NextId(); - for (int i = 0; i < *num_retvals; ++i) { - tensorflow::TensorHandle* h = - new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context); - retvals[i] = new TFE_TensorHandle(h); - handle_retvals[i] = h; - } - tensorflow::EagerNode* node = new tensorflow::ExecuteNode( - id, &op->ctx->context, op->device, op->inputs, kernel, - maybe_stats.release(), output_dtypes, handle_retvals); - ctx->context.ExecutorAdd(node); - } else { - // Execute checks if retvals[i] is nullptr or not to figure if it needs to - // allocate it. - tensorflow::gtl::InlinedVector handle_retvals( - *num_retvals); - status->status = tensorflow::EagerExecute( - &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(), - handle_retvals.data(), *num_retvals); - for (int i = 0; i < *num_retvals; ++i) { - retvals[i] = new TFE_TensorHandle(handle_retvals[i]); - } + for (int i = 0; i < *num_retvals; ++i) { + retvals[i] = new TFE_TensorHandle(handle_retvals[i]); } } @@ -1016,62 +502,6 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, ctx->context.RunMetadataProto()->Clear(); } -void TFE_GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, - TF_Buffer* output_proto, - TF_Status* status) { - tensorflow::Node* node = &output.oper->node; - tensorflow::CppShapeInferenceResult::HandleData handle_data; - handle_data.set_is_set(true); - { - tensorflow::mutex_lock l(graph->mu); - tensorflow::shape_inference::InferenceContext* ic = - graph->refiner.GetContext(node); - CHECK(ic != nullptr); - CHECK_LT(output.index, ic->num_outputs()); - const auto* shapes_and_types = - ic->output_handle_shapes_and_types(output.index); - if (shapes_and_types == nullptr) { - output_proto->data = nullptr; - output_proto->length = 0; - output_proto->data_deallocator = nullptr; - return; - } - - for (const auto& p : *shapes_and_types) { - auto* out_shape_and_type = handle_data.add_shape_and_type(); - ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape()); - out_shape_and_type->set_dtype(p.dtype); - } - } - status->status = MessageToBuffer(handle_data, output_proto); -} - -void TFE_SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, - const void* proto, size_t proto_len, - TF_Status* status) { - tensorflow::CppShapeInferenceResult::HandleData handle_data; - if (!handle_data.ParseFromArray(proto, proto_len)) { - status->status = tensorflow::errors::InvalidArgument( - "Couldn't deserialize HandleData proto"); - return; - } - DCHECK(handle_data.is_set()); - - tensorflow::mutex_lock l(graph->mu); - tensorflow::shape_inference::InferenceContext* ic = - graph->refiner.GetContext(&output.oper->node); - - std::vector shapes_and_types; - for (const auto& shape_and_type_proto : handle_data.shape_and_type()) { - tensorflow::shape_inference::ShapeHandle shape; - status->status = - ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape); - if (status->status.ok()) return; - shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype()); - } - ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); -} - namespace { TFE_Op* GetFunc(TFE_Context* ctx, const tensorflow::NameAttrList& func, TF_Status* status) { @@ -1142,9 +572,3 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, } } } // namespace tensorflow - -TFE_Op::~TFE_Op() { - for (tensorflow::TensorHandle* h : inputs) { - h->Unref(); - } -} diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index ba77f3cd07ff686782ad7b5c5751811aa3336555..c06ce84a8c578aa60dd626c24bd58098b78ae750 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -329,20 +329,6 @@ TF_CAPI_EXPORT extern void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf, TF_Status* status); -// Returns the serialized CppShapeInferenceResult::HandleData proto for -// `output` if its a resource tensor, or otherwise returns an empty buffer. -TF_CAPI_EXPORT extern void TFE_GetResourceHandleShapeAndType( - TF_Graph* graph, TF_Output output, TF_Buffer* output_proto, - TF_Status* status); - -// Sets `output` based on `proto`, which should be a serialized -// CppShapeInferenceResult::HandleData proto. -TF_CAPI_EXPORT extern void TFE_SetResourceHandleShapeAndType(TF_Graph* graph, - TF_Output output, - const void* proto, - size_t proto_len, - TF_Status* status); - #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 05dc64f521735f944559392f470a37590e93f17c..49e1aab1cef9577256d9b081858cf094c788caf8 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/eager/kernel_and_device.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/common_runtime/function.h" @@ -45,7 +46,6 @@ limitations under the License. #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/public/version.h" - struct TFE_ContextOptions { TF_SessionOptions session_options; // true if async execution is enabled. @@ -85,19 +85,9 @@ struct TFE_Op { // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a // primitive operation. TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t) - : ctx(ctx), name(op), attrs(op), attr_types(t), device(nullptr) {} - - ~TFE_Op(); - - bool const is_function() const { return attr_types == nullptr; } + : operation(&ctx->context, op, t) {} - TFE_Context* ctx; // Must outlive the TFE_Op. - const tensorflow::string name; - tensorflow::AttrBuilder attrs; - const tensorflow::AttrTypeMap* attr_types; - tensorflow::gtl::InlinedVector inputs; - tensorflow::Device* device; - bool use_xla = false; + tensorflow::EagerOperation operation; }; namespace tensorflow { diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc index 93155998b86d59ec78c7ff25f146b8e3c8eac380..e18fdf6c57bd3f432d8cb73536fb816df90b3963 100644 --- a/tensorflow/c/python_api.cc +++ b/tensorflow/c/python_api.cc @@ -110,7 +110,7 @@ void ExtendSession(TF_Session* session, TF_Status* status) { session->extend_before_run = false; } -std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) { +std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) { Node* node = &output.oper->node; CppShapeInferenceResult::HandleData handle_data; handle_data.set_is_set(true); @@ -135,4 +135,30 @@ std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) { return result; } +void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, + const void* proto, size_t proto_len, + TF_Status* status) { + tensorflow::CppShapeInferenceResult::HandleData handle_data; + if (!handle_data.ParseFromArray(proto, proto_len)) { + status->status = tensorflow::errors::InvalidArgument( + "Couldn't deserialize HandleData proto"); + return; + } + DCHECK(handle_data.is_set()); + + tensorflow::mutex_lock l(graph->mu); + tensorflow::shape_inference::InferenceContext* ic = + graph->refiner.GetContext(&output.oper->node); + + std::vector shapes_and_types; + for (const auto& shape_and_type_proto : handle_data.shape_and_type()) { + tensorflow::shape_inference::ShapeHandle shape; + status->status = + ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape); + if (status->status.ok()) return; + shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype()); + } + ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); +} + } // namespace tensorflow diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h index 2d4c8cd9ed7bc926f448dab1f6b50ed74179ea14..4bcb5bde62c8a4df4e68c1ce0daaf459434ceb5d 100644 --- a/tensorflow/c/python_api.h +++ b/tensorflow/c/python_api.h @@ -55,9 +55,15 @@ void ExtendSession(TF_Session* session, TF_Status* status); // Returns the serialized CppShapeInferenceResult::HandleData proto for // `output` if its a resource tensor, or otherwise returns the empty string. -// TODO(b/74620627): remove when _USE_C_SHAPES is removed -std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output); - +std::string GetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output); + +// Sets `output` based on `proto`, which should be a serialized +// CppShapeInferenceResult::HandleData proto. +// NOTE(skyewm): `proto` is passed a void*/size_t pair instead of a std::string +// because I couldn't get SWIG to work otherwise. +void SetResourceHandleShapeAndType(TF_Graph* graph, TF_Output output, + const void* proto, size_t proto_len, + TF_Status* status); } // namespace tensorflow #endif // TENSORFLOW_C_PYTHON_API_H_ diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index 6545e4ee3eb406436937a43ddac66d017af8e108..ff348fadb24e29a83bd6c8853aa67931f6df4182 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -385,6 +385,42 @@ Status MirrorPadGradGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("MirrorPadGrad", MirrorPadGradGrad); +Status StridedSliceGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + Input x = Shape(scope, op.input(0)); + Input begin = op.input(1); + Input end = op.input(2); + Input strides = op.input(3); + int64 begin_mask; + int64 end_mask; + int64 ellipsis_mask; + int64 new_axis_mask; + int64 shrink_axis_mask; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "begin_mask", &begin_mask)); + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "end_mask", &end_mask)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "ellipsis_mask", &ellipsis_mask)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "new_axis_mask", &new_axis_mask)); + TF_RETURN_IF_ERROR( + GetNodeAttr(op.node()->attrs(), "shrink_axis_mask", &shrink_axis_mask)); + grad_outputs->push_back( + StridedSliceGrad(scope, x, begin, end, strides, grad_inputs[0], + StridedSliceGrad::BeginMask(begin_mask) + .EndMask(end_mask) + .EllipsisMask(ellipsis_mask) + .NewAxisMask(new_axis_mask) + .ShrinkAxisMask(shrink_axis_mask))); + // No gradients returned for begin, end and strides + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); + return scope.status(); +} +REGISTER_GRADIENT_OP("StridedSlice", StridedSliceGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/array_grad_test.cc b/tensorflow/cc/gradients/array_grad_test.cc index 4a215fcc9299cf8b8da04cbf151640631ed0d449..de3bd0fc9e2493f8ff76163f5be6bd4327c58c5a 100644 --- a/tensorflow/cc/gradients/array_grad_test.cc +++ b/tensorflow/cc/gradients/array_grad_test.cc @@ -354,5 +354,29 @@ TEST_F(ArrayGradTest, MirrorPadGradGrad_Symmetric) { RunTest(x, x_shape, y, y_shape); } +TEST_F(ArrayGradTest, StridedSliceGrad) { + TensorShape x_shape({6, 4, 4}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + + // y = x[2:6:2, 1:3, 1:3] + auto y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1}); + // y.shape = [2, 2, 2]; + RunTest(x, x_shape, y, {2, 2, 2}); + + // y = x[2:6:2, 1:3, 1:3] + // begin_mask = 1<<1 (ignore begin_index = 1) + // end_mask = 1<<2 (ignore end_index = 2) + y = StridedSlice(scope_, x, {2, 1, 1}, {6, 3, 3}, {2, 1, 1}, + StridedSlice::BeginMask(1 << 1).EndMask(1 << 2)); + // y.shape = [2, 3, 3]; + RunTest(x, x_shape, y, {2, 3, 3}); + + // y = [tf.newaxis, 2:6:2, 1:3, 1:3] + y = StridedSlice(scope_, x, {0, 2, 1, 1}, {0, 6, 3, 3}, {1, 2, 1, 1}, + StridedSlice::NewAxisMask(1 << 0)); + // y.shape = [1, 2, 2, 2]; + RunTest(x, x_shape, y, {1, 2, 2, 2}); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index 7c833878818022c86fd3171ec9cef9fcd3217a24..e17a7c4bf6732177508c250aa034ecc382c505d9 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -88,9 +88,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config, // Converts the graph into an XLA computation, and compiles the // computation. // TODO(toddw): Should we let the user pick the XLA cpu vs. gpu client? - namespace gpu = perftools::gputools; - gpu::Platform* cpu_platform = - gpu::MultiPlatformManager::PlatformWithName("Host").ValueOrDie(); + se::Platform* cpu_platform = + se::MultiPlatformManager::PlatformWithName("Host").ValueOrDie(); xla::CompileOnlyClient* client = xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform) .ValueOrDie(); diff --git a/tensorflow/compiler/aot/test.cc b/tensorflow/compiler/aot/test.cc index 47ef5f82cbc718ea300afa0c4eb4b73e1ca22fd0..6b098049cbd7539a2b2e2696b13139a8a6b28e0f 100644 --- a/tensorflow/compiler/aot/test.cc +++ b/tensorflow/compiler/aot/test.cc @@ -35,6 +35,7 @@ limitations under the License. // clang-format on #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 50fa95c4f322e85c22f7be2d63f2bcd194ee419e..af2965bba5b91a66e206f05bb8945b0dcde1d2b4 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -180,6 +180,7 @@ cc_library( "//tensorflow/core/kernels:no_op", "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:variable_ops", + "@com_google_absl//absl/memory", ], ) @@ -256,19 +257,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "graph_to_functiondef", - srcs = ["graph_to_functiondef.cc"], - hdrs = ["graph_to_functiondef.h"], - visibility = [":friends"], - deps = [ - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], -) - cc_library( name = "create_xla_launch_op", srcs = [ @@ -299,7 +287,6 @@ cc_library( ], deps = [ ":common", - ":graph_to_functiondef", ":shape_inference_helpers", ":union_find", "//tensorflow/compiler/jit/graphcycles", @@ -346,28 +333,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "graph_to_functiondef_test", - size = "small", - srcs = [ - "graph_to_functiondef_test.cc", - ], - deps = [ - ":graph_to_functiondef", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:cc_ops_internal", - "//tensorflow/cc:function_ops", - "//tensorflow/cc:ops", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/core:core_cpu", - "//tensorflow/core:framework_internal", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - ], -) - tf_cc_test( name = "compilation_passes_test", size = "small", @@ -378,7 +343,6 @@ tf_cc_test( deps = [ ":common", ":compilation_passes", - ":graph_to_functiondef", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", "//tensorflow/cc:function_ops", diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index 9465385b5856baf4d03f280ff30572e196a7663b..f06debaf316c0172a5683e56aa5de6ebb83fbece 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -22,7 +22,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/graph_to_functiondef.h" +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/encapsulate_subgraphs_pass_flags.h" #include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/graph/algorithm.h" @@ -160,6 +161,11 @@ class Encapsulator { std::move(outside_compilation_attribute)), graph_in_(graph_in) {} + // Find dependencies between subgraphs and outside_compilation clusters that + // only manifest via edges between outside_compilation clusters in the outer + // (non-compiled) graph. + Status FindClusterDependencies(); + // Find subgraphs marked with 'group_attribute', and build a new // subgraph, one for each value of 'group_attribute'. Status SplitIntoSubgraphs(); @@ -230,6 +236,19 @@ class Encapsulator { // the shapes of any ancestor RAH outputs. If it can be determined that the // shape of the SFH inputs will not be inferrable even once the shapes of the // RAH outputs are known, an error is returned by the rewriter. + // + // Once edges between compiled and outside_compilation clusters have been + // replaced by send/recv ops, some dependencies may no longer be apparent. + // A clustering pass finds all the dependencies between HC nodes that are only + // present as a result of edges between nodes in outside_compilaton clusters. + // Suppose there is a path from outside_compilation cluster C in subgraph S + // to outside_compilation cluster D in subgraph T. If S != T then a control + // edge is added from the call node for S to the call node for T, which + // ensures that C will execute before D because S executes before T. If S==T + // then a control dependency is added between the HC nodes for C and D in S, + // and the HC node for C is added to an 'ancestors' attr in the HC node for D + // so that during compilation of the HC node for D, an XLA control dependency + // can be added to ensure C's SendToHost executes before D's RecvFromHost. class Subgraph { public: // Creates a graph to build the subgraph in, if it doesn't already exist, @@ -324,6 +343,18 @@ class Encapsulator { void RecordOutsideCompilationOutputOrControl( const string& outside_compilation_id, const Edge* edge); + // Records the fact that there is a path from a node in outside_compilation + // cluster ancestor to node in cluster successor that does not go through + // the subgraph. + void RecordOutsideCompilationDependency(const string& successor, + const string& ancestor); + + // Returns the mapping from outside_compilation cluster C to the set of + // outside_compilation clusters that have a path to C entirely outside + // compiled subgraphs. + const std::unordered_map> + OutsideCompilationAncestorMap() const; + // Adds the HostCompute nodes for each outside_compilation subgraph. Status AddHostComputes( const string& subgraph_name, @@ -406,6 +437,13 @@ class Encapsulator { Status AddHostComputeKeyPlaceholder(OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out); + // Get the set of outside_compilation clusters and the dependency edges + // between them. + void GetActiveClusterDependencyGraph( + std::unordered_set* clusters, + std::unordered_set* has_successor, + std::unordered_map>* ancestors_map); + // Builds a _RecvAtHost node producing all the inputs of an // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host. Status AddRecvAtHostNode(const string& group_attribute, @@ -468,6 +506,14 @@ class Encapsulator { // The outside_compilation clusters in this subgraph. std::unordered_map outside_compilation_subgraphs_; + // For each outside_compilation cluster C, the outside_compilation clusters + // that have a path to C outside the compiled graph. + std::unordered_map> + outside_compilation_ancestors_; + // For each outside_compilation cluster C, the outside_compilation clusters + // that have a path from C outside the compiled graph. + std::unordered_map> + outside_compilation_successors_; // NoOp node in the output graph that is sequenced after the call node and // used to prevent host-side outside_compilation sends and recvs from being @@ -556,6 +602,10 @@ class Encapsulator { std::unordered_set, NodeSlot::PairHasher>* edges_added); + // Adds control dependencies between subgraph call nodes that have + // dependencies via outside_compilation edges. + Status AddCallNodeDependencies(Graph* graph_out); + // Adds all edges to the output graph. Status AddEdgesToOutputGraph( const std::unordered_map& node_images, @@ -620,10 +670,65 @@ class Encapsulator { const Graph* graph_in_; std::unordered_map subgraphs_; + // For each subgraph S the subgraphs S' such that there is a path in some + // outside_compilation cluster C in S to some outside_compilation cluster C' + // in S', that goes only through the uncompiled graph. + std::unordered_map> subgraph_ancestors_; TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator); }; +namespace { + +// Return in 'sorted' a topological sort of clusters according to the +// dependencies encoded in ancestors. clusters is the list of all clusters +// including clusters that are not present in the ancestors map. has_successors +// is the set of clusters that are ancestors of some other cluster. +void TopologicalClusterSort( + const std::unordered_set& clusters, + const std::unordered_set& has_successors, + const std::unordered_map>& ancestors, + std::vector* sorted) { + // The nodes are placed in 'sorted' in topological order. + sorted->clear(); + // We don't use the standard DFS because we are not operating on Node* + // objects. + struct Work { + string cluster; + bool leave; + }; + std::set visited; + std::vector stack; + // Seed the processing list with clusters that have no successors. + for (const auto& cluster : clusters) { + if (has_successors.find(cluster) == has_successors.end()) { + stack.push_back({cluster, false}); + } + } + while (!stack.empty()) { + const Work item = stack.back(); + stack.pop_back(); + if (item.leave) { + sorted->push_back(item.cluster); + continue; + } + + if (visited.find(item.cluster) != visited.end()) continue; + visited.insert(item.cluster); + + stack.push_back({item.cluster, true}); + const auto& iter = ancestors.find(item.cluster); + if (iter != ancestors.end()) { + for (const auto& ancestor : iter->second) { + stack.push_back({ancestor, false}); + } + } + } + CHECK(sorted->size() == clusters.size()); +} + +} // namespace + Node* Encapsulator::Subgraph::GetCallNodeForInputs() const { return call_node_inputs_; } @@ -786,12 +891,71 @@ void Encapsulator::Subgraph::RecordOutsideCompilationOutputOrControl( } } +void Encapsulator::Subgraph::RecordOutsideCompilationDependency( + const string& successor, const string& ancestor) { + outside_compilation_ancestors_[successor].insert(ancestor); + outside_compilation_successors_[ancestor].insert(successor); +} + +const std::unordered_map> +Encapsulator::Subgraph::OutsideCompilationAncestorMap() const { + return outside_compilation_ancestors_; +} + +void Encapsulator::Subgraph::GetActiveClusterDependencyGraph( + std::unordered_set* clusters, + std::unordered_set* has_successor, + std::unordered_map>* ancestors_map) { + // During initial clustering the ancestor and successor datastructures may + // have been built including oc_cluster names that never turned into subgraphs + // because they had no edges into or out of the compiled cluster. Remove them + // before proceeding to simplify the logic. Get the set of clusters that was + // actually added, then remove references to the others. + for (const auto& oc_subgraph : outside_compilation_subgraphs_) { + clusters->insert(oc_subgraph.first); + } + for (const auto& cluster : outside_compilation_successors_) { + if (clusters->find(cluster.first) != clusters->end()) { + for (const auto& successor : cluster.second) { + if (clusters->find(successor) != clusters->end()) { + has_successor->insert(cluster.first); + break; + } + } + } + } + for (const auto& cluster : outside_compilation_ancestors_) { + if (clusters->find(cluster.first) != clusters->end()) { + std::unordered_set& ancestors = (*ancestors_map)[cluster.first]; + for (const auto& ancestor : cluster.second) { + if (clusters->find(ancestor) != clusters->end()) { + ancestors.insert(ancestor); + } + } + } + } +} + Status Encapsulator::Subgraph::AddHostComputes( const string& subgraph_name, const std::unordered_map& node_images) { - for (auto& oc_subgraph_iter : outside_compilation_subgraphs_) { - const string& oc_subgraph_name = oc_subgraph_iter.first; - OutsideCompilationSubgraph& oc_subgraph = oc_subgraph_iter.second; + // Get the set of outside_compilation clusters and the dependency edges + // between them. + std::unordered_set clusters; + std::unordered_set has_successor; + std::unordered_map> ancestors_map; + GetActiveClusterDependencyGraph(&clusters, &has_successor, &ancestors_map); + // Topologically sort the outside_compilation clusters according to their + // dependency relation. + std::vector sorted_clusters; + TopologicalClusterSort(clusters, has_successor, ancestors_map, + &sorted_clusters); + + // The host compute nodes added for each outside_compilation_cluster; + std::unordered_map host_compute_node; + for (const string& oc_subgraph_name : sorted_clusters) { + OutsideCompilationSubgraph& oc_subgraph = + outside_compilation_subgraphs_[oc_subgraph_name]; if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty() || !oc_subgraph.outputs_by_src.empty() || !oc_subgraph.control_outputs.empty()) { @@ -811,13 +975,22 @@ Status Encapsulator::Subgraph::AddHostComputes( inputs[input_index].Reset(src_image->name(), src_slot, dtype); input_dtypes[input_index] = dtype; } - for (const auto& output : oc_subgraph.outputs_by_src) { DataType dtype = output.first.dtype; int output_index = output.second; output_dtypes[output_index] = dtype; } + std::vector host_compute_ancestors; + const auto iter = ancestors_map.find(oc_subgraph_name); + if (iter != ancestors_map.end()) { + for (const string& ancestor_cluster : iter->second) { + host_compute_ancestors.push_back( + outside_compilation_subgraphs_[ancestor_cluster] + .host_compute_name); + } + } + NodeDef host_compute_def; NodeDefBuilder builder(strings::StrCat("outside_compilation_", oc_subgraph_name, "_host_compute"), @@ -825,6 +998,7 @@ Status Encapsulator::Subgraph::AddHostComputes( builder.Input(inputs); builder.Attr("Tinputs", input_dtypes); builder.Attr("Toutputs", output_dtypes); + builder.Attr("ancestors", host_compute_ancestors); builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name, "_", oc_subgraph_name)); @@ -834,6 +1008,7 @@ Status Encapsulator::Subgraph::AddHostComputes( Node* host_compute = graph_->AddNode(host_compute_def, &s); if (!s.ok()) return s; + host_compute_node[host_compute->name()] = host_compute; oc_subgraph.host_compute_name = host_compute->name(); // Connect the _HostCompute node to its producers in the subgraph. @@ -852,6 +1027,12 @@ Status Encapsulator::Subgraph::AddHostComputes( graph_->AddControlEdge(src_image, host_compute); } + // Connect the _HostCompute node to its ancestor host compute nodes. + for (const auto& ancestor_name : host_compute_ancestors) { + Node* ancestor = host_compute_node[ancestor_name]; + graph_->AddControlEdge(ancestor, host_compute); + } + // Connect the consumers in the subgraph to the _HostCompute node. for (const auto& output : oc_subgraph.outputs_by_dst) { const Node* dst_node = output.first.node; @@ -1654,6 +1835,17 @@ Status Encapsulator::CopyEdgeToOutputGraph( return Status::OK(); } +Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) { + for (const auto& ancestors : subgraph_ancestors_) { + const string& subgraph = ancestors.first; + for (const string& ancestor : ancestors.second) { + graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNodeForOutputs(), + subgraphs_[subgraph].GetCallNodeForInputs()); + } + } + return Status::OK(); +} + Status Encapsulator::AddEdgesToOutputGraph( const std::unordered_map& node_images, bool parallel_checking, Graph* graph_out) { @@ -1703,6 +1895,7 @@ Status Encapsulator::AddEdgesToOutputGraph( Subgraph& subgraph = subgraph_entry.second; subgraph.ConnectSequencerToCallNode(graph_out); } + TF_RETURN_IF_ERROR(AddCallNodeDependencies(graph_out)); return Status::OK(); } @@ -1960,6 +2153,182 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend( return Status::OK(); } +namespace { + +// Helper struct for building cluster dependencies and also debugging cycles in +// the dependencies. While computing dependencies we construct a mapping from +// Node* to PathDetails. +struct PathDetails { + struct SubgraphAndCluster { + string subgraph; + string outside_compilation_cluster; + bool operator==(const SubgraphAndCluster& other) const { + return subgraph == other.subgraph && + outside_compilation_cluster == other.outside_compilation_cluster; + } + }; + + struct SubgraphAndClusterHash { + inline std::size_t operator()(const SubgraphAndCluster& v) const { + return hash()( + strings::StrCat(v.subgraph, v.outside_compilation_cluster)); + } + }; + + typedef std::unordered_set + SubgraphAndClusterSet; + + // Returns the set of (subgraph, oc_cluster) pairs that should be recorded as + // ancestors for any successor of this node. If the node is in the outer + // graph, it returns the transitive union of the ancestors of the node's + // inputs. If the node is in an outside_compilation cluster, it returns just + // that cluster. If the node is compiled, it returns the empty set. + SubgraphAndClusterSet AncestorsForSuccessor() { + if (subgraph.empty()) { + return ancestor_clusters; + } else if (outside_compilation_cluster.empty()) { + return SubgraphAndClusterSet(); + } else { + SubgraphAndCluster entry; + entry.subgraph = subgraph; + entry.outside_compilation_cluster = outside_compilation_cluster; + return SubgraphAndClusterSet({entry}); + } + } + + // The transitive union of the ancestor's of this node's inputs. This is only + // saved for debugging in order to print out enough information to debug a + // discovered cycle. + SubgraphAndClusterSet ancestor_clusters; + // The subgraph attr on this node. + string subgraph; + // The outside_compilation attr on this node. + string outside_compilation_cluster; +}; + +// Adds an edge from ancestor to successor to the cycle detector, and returns an +// error if that edge causes the formation of a cycle. In the error case, logs +// the contents of the node_ancestors_map to facilitate debugging. +Status CheckClusterDependencyForCycles( + const string& ancestor, const string& successor, + const std::unordered_map>& ancestors, + const std::unordered_map& node_ancestors_map, + GraphCycles* cycle_detector, std::map* cycle_detector_map) { + if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) { + (*cycle_detector_map)[ancestor] = cycle_detector->NewNode(); + } + if (cycle_detector_map->find(successor) == cycle_detector_map->end()) { + (*cycle_detector_map)[successor] = cycle_detector->NewNode(); + } + + if (!cycle_detector->InsertEdge((*cycle_detector_map)[ancestor], + (*cycle_detector_map)[successor])) { + LOG(ERROR) << "Cycle in outside_compilation clusters"; + for (const auto& cluster : ancestors) { + LOG(ERROR) << "Cluster " << cluster.first << " depends on:"; + for (const auto& ancestor : cluster.second) { + LOG(ERROR) << " " << ancestor; + } + } + for (const auto& node_ancestors : node_ancestors_map) { + LOG(ERROR) << "Node " << node_ancestors.first->name() << " (" + << node_ancestors.second.subgraph << ";" + << node_ancestors.second.outside_compilation_cluster + << ") has ancestor clusters:"; + for (const auto& ancestor : node_ancestors.second.ancestor_clusters) { + LOG(ERROR) << " " << ancestor.subgraph << ";" + << ancestor.outside_compilation_cluster; + } + } + return errors::InvalidArgument( + "Can't compile outside_compilation clusters because there is a " + "dependency cycle: see error log for details."); + } + return Status::OK(); +} + +} // namespace + +Status Encapsulator::FindClusterDependencies() { + // Map from nodes to ancestor details. A node is entered into the map if it is + // in a compilation subgraph, and outside_compilation cluster, or appears on a + // path in the outer graph leading from an outside_compilation subgraph. + std::unordered_map node_ancestors_map; + // We check that clusters are acyclic using this cycle detector. + GraphCycles cycle_detector; + // Map from cluster name to cycle detector node id. + std::map cycle_detector_map; + // Process the nodes in topologically-sorted order. + std::vector nodes; + GetReversePostOrder(*graph_in_, &nodes); + for (Node* node : nodes) { + string subgraph_name; + string oc_cluster; + TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &subgraph_name, &oc_cluster)); + // First create an entry in the ancestors map if the node is in a compiled + // subgraph or outside_compilation cluster, or if any incoming edge is from + // a node with an ancestor map entry; and find the union of all the + // ancestors. + if (!subgraph_name.empty()) { + node_ancestors_map[node].subgraph = subgraph_name; + node_ancestors_map[node].outside_compilation_cluster = oc_cluster; + } + for (Node* src : node->in_nodes()) { + const auto iter = node_ancestors_map.find(src); + if (iter != node_ancestors_map.end()) { + const auto& ancestors_to_follow = iter->second.AncestorsForSuccessor(); + for (const auto& ancestor : ancestors_to_follow) { + if (ancestor.subgraph != subgraph_name || + ancestor.outside_compilation_cluster != oc_cluster) { + node_ancestors_map[node].ancestor_clusters.insert(ancestor); + } + } + } + } + if (!subgraph_name.empty()) { + // The node is in a compiled subgraph or an outside_compilation cluster. + if (oc_cluster.empty()) { + // The node is not in an outside_compilation cluster. Record the + // subgraph's ancestor dependencies. + for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { + if (cluster.subgraph != subgraph_name) { + subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.subgraph, subgraph_name, subgraph_ancestors_, + node_ancestors_map, &cycle_detector, &cycle_detector_map)); + } + } + } else { + Subgraph& subgraph = subgraphs_[subgraph_name]; + // The node is in an outside_compilation cluster. Record the cluster + // and/or subgraph ancestor dependencies. + for (const auto& cluster : node_ancestors_map[node].ancestor_clusters) { + if (cluster.subgraph == subgraph_name) { + // The ancestor is in the same subgraph. + if (cluster.outside_compilation_cluster != oc_cluster) { + // But not in the same oc_cluster, so record the dependency. + subgraph.RecordOutsideCompilationDependency( + oc_cluster, cluster.outside_compilation_cluster); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.outside_compilation_cluster, oc_cluster, + subgraph.OutsideCompilationAncestorMap(), node_ancestors_map, + &cycle_detector, &cycle_detector_map)); + } + } else { + // The ancestor is in a different subgraph, so record the + // dependency. + subgraph_ancestors_[subgraph_name].insert(cluster.subgraph); + TF_RETURN_IF_ERROR(CheckClusterDependencyForCycles( + cluster.subgraph, subgraph_name, subgraph_ancestors_, + node_ancestors_map, &cycle_detector, &cycle_detector_map)); + } + } + } + } + } + return Status::OK(); +} + Status Encapsulator::MakePrunedGraphCopyAndInline( const Graph& graph, const std::vector& sink_nodes, std::unique_ptr* pruned_graph, @@ -2166,6 +2535,7 @@ Status EncapsulateSubgraphsInFunctions( Encapsulator encapsulator(std::move(group_attribute), std::move(outside_compilation_attribute), &graph_in); + TF_RETURN_IF_ERROR(encapsulator.FindClusterDependencies()); TF_RETURN_IF_ERROR(encapsulator.SplitIntoSubgraphs()); TF_RETURN_IF_ERROR(encapsulator.BuildFunctionDefs( diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 8599a7038af9663e5af6f3231429cb7f6ea5f69b..5ec24d39a2c40a766dbb0ec51ebe798de620e24b 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -20,8 +20,8 @@ limitations under the License. #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -74,7 +74,7 @@ bool EqualProtoMap(const ::tensorflow::protobuf::Map& a, if (!compare(elt_a.first, elt_a.second, iter->second)) { if (diff) { *diff = strings::StrCat(map_name, " expected: element with key '", - key_to_string(elt_a.first), " has value '", + key_to_string(elt_a.first), "' has value '", value_to_string(elt_a.second), "' got: '", value_to_string(iter->second), "'"); } @@ -121,8 +121,22 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, } return false; } + std::unordered_set control_input_a; + std::unordered_set control_input_b; for (int i = 0; i < a.input_size(); ++i) { - if (a.input(i) != b.input(i)) { + if (str_util::StartsWith(a.input(i), "^")) { + if (!str_util::StartsWith(b.input(i), "^")) { + if (diff) { + *diff = strings::StrCat( + diff_preamble, " mismatch for node ", a.name(), " input ", i, + ", expected control input ", a.input(i), " got ", b.input(i), + " expected:\n", a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } + control_input_a.insert(a.input(i)); + control_input_b.insert(b.input(i)); + } else if (a.input(i) != b.input(i)) { if (diff) { *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), " input ", i, ", expected ", a.input(i), @@ -132,11 +146,29 @@ bool EqualFunctionNodeDef(const NodeDef& a, const NodeDef& b, return false; } } + if (control_input_a != control_input_b) { + if (diff) { + *diff = strings::StrCat(diff_preamble, " mismatch for node ", a.name(), + " control inputs differ expected:\n", + a.DebugString(), "\ngot:\n", b.DebugString()); + } + return false; + } return EqualProtoMap( a.attr(), b.attr(), [](const string& s) { return s; }, [](const AttrValue& v) { return v.DebugString(); }, [](const string& key, const AttrValue& av, const AttrValue& bv) { - return av.DebugString() == bv.DebugString(); + if (key == "ancestors") { + // The ancestors are added from a set so the order is unpredictable; + // just compare set equality not list equality. + std::unordered_set a_set(av.list().s().begin(), + av.list().s().end()); + std::unordered_set b_set(bv.list().s().begin(), + bv.list().s().end()); + return a_set == b_set; + } else { + return av.DebugString() == bv.DebugString(); + } }, strings::StrCat(diff_preamble, " attr mismatch for node ", a.name()), diff); @@ -261,6 +293,7 @@ REGISTER_OP("XlaHostCompute") .Output("outputs: Toutputs") .Attr("Tinputs: list(type) >= 0") .Attr("Toutputs: list(type) >= 0") + .Attr("ancestors: list(string) >= 0") .Attr("key: string") .Attr("shape_inference_graph: string = ''") .Attr("shapes: list(shape) >= 0") @@ -899,6 +932,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) { {"C:o:0", "c:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1044,17 +1078,20 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"D:o:0", "F:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, {"key", "host_compute_channel_F1_O2"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O2"}, {"shapes", gtl::ArraySlice({})}, {"_outside_compilation_subgraph", "O2"}}, - {"F"}}, + {"F", "outside_compilation_O1_host_compute"}}, {{"outside_compilation_O1_host_compute"}, "XlaHostCompute", {"C:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1193,6 +1230,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"C:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, @@ -1215,6 +1253,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { {"G:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F2_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1279,6 +1318,179 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test with two functions to transform, each with one outside_compilation +// cluster, with the dependency between them purely from an outside_compilation +// edge. +TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = InputShaped(b1.opts().WithName("A")); + Node* b = InputShaped(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Binary(c, d, + b1.opts() + .WithName("E") + .WithControlInputs({b, d}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Binary(c, e, + b1.opts().WithName("F").WithControlInput(e).WithAttr( + "_encapsulate", "F1")); + Node* g = + Binary(a, b, b1.opts().WithName("G").WithAttr("_encapsulate", "F2")); + Node* h = Unary(g, b1.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1") + .WithControlInput(e)); + Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2")); + Binary(f, i, b1.opts().WithName("J")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT, DT_FLOAT}, shape.opts()); + Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1), + shape.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected)); + } + + { + GraphDefBuilder shape(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1", + {DT_FLOAT}, shape.opts()); + Node* h = Unary(recv, shape.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, + "BinaryTest", + {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"}, + {}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"C:o:0", "D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}, + {"D"}}, + }, + {{"f_0_retval", "F:o:0"}}); + + *library_expected.add_function() = FunctionDefHelper::Create( + "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {}, + { + {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}}, + {{"I"}, + "UnaryTest", + {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"G:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F2_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F2_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + }, + {{"i_0_retval", "I:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = InputShaped(b2.opts().WithName("A")); + Node* b = InputShaped(b2.opts().WithName("B")); + + Node* key_constant1 = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1", + {DT_FLOAT, DT_FLOAT}, b2.opts()); + Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1), + b2.opts() + .WithName("E") + .WithControlInputs({recv1, b}) + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e}, + b2.opts().WithControlInput(e)); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}), + "F1"); + + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b); + Node* call1 = + b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1); + + Node* key_constant2 = + KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1", + {DT_FLOAT}, b2.opts()); + Node* h = Unary(recv2, b2.opts() + .WithName("H") + .WithAttr("_encapsulate", "F2") + .WithAttr("_outside", "O1") + .WithControlInput(e)); + Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h}, + b2.opts()); + + Node* s2 = Sequencer( + b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}), + "F2"); + NodeBuilder node_builder2("F2", "F2", lib_def.get()); + node_builder2.Input(a).Input(b); + Node* call2 = b2.opts() + .WithControlInputs({s2, call1}) + .FinalizeBuilder(&node_builder2); + Binary(call1, call2, b2.opts().WithName("J")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + // Test with one outside_compilation cluster that has no inputs from the // compiled subgraph. TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { @@ -1323,6 +1535,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) { {}, {{"Tinputs", gtl::ArraySlice({})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1406,6 +1619,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) { {}, {{"Tinputs", gtl::ArraySlice({})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", @@ -1487,6 +1701,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) { {"D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", gtl::ArraySlice({})}, @@ -1567,6 +1782,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { {"D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", ""}, {"shapes", gtl::ArraySlice({})}, @@ -1607,6 +1823,371 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) { TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); } +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph, where the ancestor has no HostCompute Op. +TEST(EncapsulateSubgraphsTest, + OutsideCompilationClusterDependencyNoSrcCluster) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(a, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(d, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + Node* g = Unary(f, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(g, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + Binary(e, h, b1.opts().WithName("I")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, shape2.opts()); + Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, "UnaryTest", {"D:o:0"}}, + {{"H"}, + "UnaryTest", + {"outside_compilation_O2_host_compute:outputs:0"}}, + {{"outside_compilation_O2_host_compute"}, + "XlaHostCompute", + {"F:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O2"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O2"}}}, + }, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* e = Unary(a, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, b2.opts()); + Node* g = Unary(recv, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts()); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("I")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph, where the successor has no HostCompute Op. +TEST(EncapsulateSubgraphsTest, + OutsideCompilationClusterDependencyNoDstCluster) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(d, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + /*Node* g =*/Unary(a, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + Binary(e, h, b1.opts().WithName("I")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, shape1.opts()); + Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + { + {{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, + "UnaryTest", + {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"H"}, "UnaryTest", {"F:o:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + }, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, b2.opts()); + Node* e = Unary(recv, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts()); + /*Node* g =*/Unary(a, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* s1 = Sequencer( + b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("I")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + +// Test with two outside_compilation clusters that interact outside the compiled +// subgraph. +TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) { + FunctionDefLibrary library; + GraphDef graphdef; + + { + GraphDefBuilder b1(GraphDefBuilder::kFailImmediately); + Node* a = Input(b1.opts().WithName("A")); + Node* b = Input(b1.opts().WithName("B")); + Node* c = Unary(a, b1.opts().WithName("C").WithAttr("_encapsulate", "F1")); + Node* d = + Binary(b, c, b1.opts().WithName("D").WithAttr("_encapsulate", "F1")); + Node* e = Unary(d, b1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* f = Unary(e, b1.opts().WithName("F").WithAttr("_encapsulate", "F1")); + Node* g = Unary(d, b1.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* h = Unary(f, b1.opts().WithName("H").WithAttr("_encapsulate", "F1")); + /*Node* i =*/Binary(d, e, + b1.opts() + .WithName("I") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O3") + .WithControlInput(g)); + Binary(e, h, b1.opts().WithName("J")); + TF_EXPECT_OK(b1.ToGraphDef(&graphdef)); + } + + TF_EXPECT_OK(Encapsulate(&graphdef, &library)); + + FunctionDefLibrary library_expected; + GraphDef graphdef_expected; + + { + GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately); + Node* key_constant = + KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0")); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, shape1.opts()); + Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts()); + TF_EXPECT_OK( + AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected)); + } + + *library_expected.add_function() = FunctionDefHelper::Create( + "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {}, + {{{"C"}, "UnaryTest", {"a_0_arg"}}, + {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}}, + {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}}, + {{"H"}, "UnaryTest", {"F:o:0"}}, + {{"outside_compilation_O1_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, + {"key", "host_compute_channel_F1_O1"}, + {"shape_inference_graph", + "_outside_compilation_shape_inference_F1_O1"}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O1"}}}, + {{"outside_compilation_O2_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute"})}, + {"key", "host_compute_channel_F1_O2"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O2"}}, + {"outside_compilation_O1_host_compute"}}, + {{"outside_compilation_O3_host_compute"}, + "XlaHostCompute", + {"D:o:0"}, + {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, + {"Toutputs", gtl::ArraySlice({})}, + {"ancestors", + gtl::ArraySlice({"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"})}, + {"key", "host_compute_channel_F1_O3"}, + {"shape_inference_graph", ""}, + {"shapes", gtl::ArraySlice({})}, + {"_outside_compilation_subgraph", "O3"}}, + {"outside_compilation_O1_host_compute", + "outside_compilation_O2_host_compute"}}}, + {{"h_0_retval", "H:o:0"}}); + + { + std::unique_ptr lib_def( + new FunctionLibraryDefinition(OpRegistry::Global(), library_expected)); + GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get()); + Node* a = Input(b2.opts().WithName("A")); + Node* b = Input(b2.opts().WithName("B")); + + Node* key_constant = + KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder")); + Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", + {DT_FLOAT}, b2.opts()); + Node* e = Unary(recv1, b2.opts() + .WithName("E") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O1")); + Node* send = + SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts()); + Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", + {DT_FLOAT}, b2.opts()); + Node* g = Unary(recv2, b2.opts() + .WithName("G") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O2") + .WithControlInput(e)); + Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3", + {DT_FLOAT}, b2.opts()); + /*Node* i =*/Binary(recv3, e, + b2.opts() + .WithName("I") + .WithAttr("_encapsulate", "F1") + .WithAttr("_outside", "O3") + .WithControlInput(g)); + Node* s1 = Sequencer(b2.opts() + .WithName("F1_sequencer") + .WithControlInputs({recv1, send, recv2, recv3}), + "F1"); + NodeBuilder node_builder1("F1", "F1", lib_def.get()); + node_builder1.Input(a).Input(b).ControlInput(s1); + Node* call1 = b2.opts().FinalizeBuilder(&node_builder1); + + Binary(e, call1, b2.opts().WithName("J")); + TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected)); + } + + TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef); + TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library); +} + // Test with one outside_compilation cluster that has no outputs from the // compiled subgraph. TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) { @@ -1731,6 +2312,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) { {"c:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, + {"ancestors", gtl::ArraySlice({})}, {"key", "host_compute_channel_F1_O1"}, {"shape_inference_graph", "_outside_compilation_shape_inference_F1_O1"}, diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index f48941fce329313e4484b3c2dd900eeac884ed34..049d170fa48928474b894f2d0e1f2243c5f87275 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -37,8 +37,6 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/util/stream_executor_util.h" -namespace gpu = perftools::gputools; - namespace tensorflow { XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) @@ -51,9 +49,9 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) num_constant_args_ = constant_types.size(); OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_)); if (device_type_ == DeviceType(DEVICE_CPU)) { - platform_id_ = gpu::host::kHostPlatformId; + platform_id_ = se::host::kHostPlatformId; } else if (device_type_ == DeviceType(DEVICE_GPU)) { - platform_id_ = gpu::cuda::kCudaPlatformId; + platform_id_ = se::cuda::kCudaPlatformId; } else { platform_id_ = nullptr; } @@ -69,9 +67,9 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx, return Status::OK(); } - auto platform = gpu::MultiPlatformManager::PlatformWithId(platform_id_); + auto platform = se::MultiPlatformManager::PlatformWithId(platform_id_); if (!platform.ok()) { - return StreamExecutorUtil::ConvertStatus(platform.status()); + return platform.status(); } xla::LocalClientOptions client_options; client_options.set_platform(platform.ValueOrDie()); @@ -100,7 +98,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { ResourceMgr* rm = ctx->resource_manager(); OP_REQUIRES(ctx, rm, errors::Internal("No resource manager.")); - gpu::Stream* stream = + se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; XlaCompilationCache* cache; @@ -153,7 +151,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { options.device_type = &cache->device_type(); options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); options.graph_def_version = ctx->function_library()->graph_def_version(); - options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId); + options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId); options.device_allocator = xla_allocator; // TODO(b/77671268): We don't set variable_representation_shape_fn here. This // is restricted to Variables, but we need something like this to apply to diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h index c6cc0986af0300c51283d432c671e92a1e4d8145..8f8e646f0ff6d94dfdf56721cacfce7fa658beb6 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.h +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h @@ -53,7 +53,7 @@ class XlaLocalLaunchOp : public OpKernel { // Number of resource variable arguments. int num_resource_args_; - perftools::gputools::Platform::Id platform_id_; + se::Platform::Id platform_id_; TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp); }; diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 6c2782e28e99df0801ab8d938fa3f961e4e1574c..60458f6f3314b2c3b65be1c90e051b2a670383bc 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -58,7 +58,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, launch_context.PopulateInputs(ctx, result, variables); - perftools::gputools::Stream* stream = + se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; TF_RET_CHECK(stream); diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 12f471735f68394a3079541e9ac8532e329bd694..c814b7eb029054d9d3659d52488729ae5989506a 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device_context.h" @@ -50,8 +51,6 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/stream_executor_util.h" -namespace se = ::perftools::gputools; - namespace tensorflow { // Caches a XlaDeviceAllocator per pair. A @@ -121,7 +120,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator( auto platform = se::MultiPlatformManager::PlatformWithName(platform_name); if (!platform.ok()) { - return StreamExecutorUtil::ConvertStatus(platform.status()); + return platform.status(); } const DeviceAttributes attrs = Device::BuildDeviceAttributes( @@ -181,9 +180,15 @@ XlaDevice::XlaDevice(const SessionOptions& options, jit_device_name_(jit_device_name), xla_allocator_(nullptr), platform_(platform), - transfer_as_literal_(transfer_as_literal) {} + transfer_as_literal_(transfer_as_literal) { + VLOG(1) << "Created XLA device " << jit_device_name; +} -XlaDevice::~XlaDevice() {} +XlaDevice::~XlaDevice() { + if (gpu_device_info_ != nullptr) { + gpu_device_info_->default_context->Unref(); + } +} xla::LocalClient* XlaDevice::client() const { // We lazily create the client because the platform commits to the @@ -191,9 +196,8 @@ xla::LocalClient* XlaDevice::client() const { // don't want to do it until we get a chance to hook the platform up // to a simulator. - // For now GetOrCreateLocalClient always returns success when passed - // a non-null platform. If that changes we may have to plumb in some - // way to pass Status back. + // TODO(b/78468222): This can fail, at least when the backend is GPU and + // there is no GPU on the host. return xla::ClientLibrary::GetOrCreateLocalClient(platform_).ValueOrDie(); } @@ -218,14 +222,31 @@ xla::StatusOr XlaDevice::GetStream() { return stream_.get(); } +Status XlaDevice::CreateAndSetGpuDeviceInfo() { + if (gpu_device_info_ == nullptr) { + TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); + // Call GetAllocator for the side-effect of ensuring the allocator + // is created. + GetAllocator({}); + // XlaDevice owns both gpu_device_info_ and + // gpu_device_info_->default_context. + gpu_device_info_ = absl::make_unique(); + gpu_device_info_->stream = stream; + gpu_device_info_->default_context = + new XlaDeviceContext(stream, client(), transfer_as_literal_); + set_tensorflow_gpu_device_info(gpu_device_info_.get()); + } + + return Status::OK(); +} + Status XlaDevice::FillContextMap(const Graph* graph, DeviceContextMap* device_context_map) { VLOG(1) << "XlaDevice::FillContextMap"; device_context_map->resize(graph->num_node_ids()); TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - // Call GetAllocator for the side-effect of ensuring the allocator and - // XlaTensorInfoManager is created. - (void)GetAllocator({}); + // Call GetAllocator for the side-effect of ensuring the allocator is created. + GetAllocator({}); auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_); for (Node* n : graph->nodes()) { VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name(); diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 4fe7dd8c9fa9eb954804555e9615160dc4bc3e8a..3ae87308cc7cffa916e178893df70a3f314b11b0 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -49,20 +49,20 @@ class XlaDevice : public LocalDevice { // retrieved e.g., when lazily creating the XlaCompilationCache device. class Metadata { public: - Metadata(int device_ordinal, perftools::gputools::Platform* platform, + Metadata(int device_ordinal, se::Platform* platform, const DeviceType& device_type); // The index of the device on this host. int device_ordinal() const; - perftools::gputools::Platform* platform() const; + se::Platform* platform() const; xla::LocalClient* client() const; const DeviceType& jit_device_type() const; private: const int device_ordinal_; const DeviceType device_type_; - perftools::gputools::Platform* platform_; // Not owned. + se::Platform* platform_; // Not owned. TF_DISALLOW_COPY_AND_ASSIGN(Metadata); }; @@ -85,8 +85,7 @@ class XlaDevice : public LocalDevice { XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs, int device_ordinal, const DeviceType& jit_device_name, - ::perftools::gputools::Platform* platform, - bool transfer_as_literal); + se::Platform* platform, bool transfer_as_literal); ~XlaDevice() override; Allocator* GetAllocator(AllocatorAttributes attr) override; @@ -103,7 +102,11 @@ class XlaDevice : public LocalDevice { Tensor* tensor) override; xla::LocalClient* client() const; - xla::StatusOr<::perftools::gputools::Stream*> GetStream(); + xla::StatusOr GetStream(); + + // If not already set, create and set GpuDeviceInfo. + // Not thread-safe + Status CreateAndSetGpuDeviceInfo(); private: // The metadata of this XlaDevice. @@ -114,7 +117,7 @@ class XlaDevice : public LocalDevice { DeviceType jit_device_name_; // Memory allocator associated with this device. Allocator* xla_allocator_; // Not owned. - ::perftools::gputools::Platform* platform_; // Not owned. + se::Platform* platform_; // Not owned. // Stream associated with this device. Operations enqueued on this // stream are executed on the device. Operations include data // copying back and forth between CPU and the device, and @@ -123,6 +126,10 @@ class XlaDevice : public LocalDevice { // Must we use XLA's transfer manager for correct host<->device transfers? if // false, we can use ThenMemcpy() instead. bool transfer_as_literal_; + + // If set, holds default device context (that we must Unref) + // and its stream. + std::unique_ptr gpu_device_info_; }; // Builds OpKernel registrations on 'device' for the JIT operators diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index 43eb164012610723214cf39360698010c9dbdbd4..bf8c1886a022310eeaacdf69463f575a393dd8d0 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -23,8 +23,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/platform/mem.h" -namespace se = ::perftools::gputools; - namespace tensorflow { // The allocator used for Tensors assigned to the XLA device. diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index ad914a1c23b5f2ea7063722f85e027a99fdb68f9..d7f5f1d208989256f8043d2e6d93cf9bd89333b2 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -45,8 +45,7 @@ class XlaDeviceAllocator : public Allocator { // Helper class for managing data transfers between host and XLA devices. class XlaTransferManager { public: - explicit XlaTransferManager(perftools::gputools::Stream* stream, - xla::LocalClient* client, + explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, @@ -54,7 +53,7 @@ class XlaTransferManager { void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece tensor_name, Device* device, Tensor* cpu_tensor, StatusCallback done); - perftools::gputools::Stream* stream() const { return stream_; } + se::Stream* stream() const { return stream_; } private: Status TransferLiteralToDevice(const Tensor& host_tensor, @@ -64,7 +63,7 @@ class XlaTransferManager { // Stream obtained from a Device, used to transfer tensors between // CPU and device. - perftools::gputools::Stream* stream_; + se::Stream* stream_; // For the underlying memory allocator and XLA's TransferManager. xla::LocalClient* client_; // Transfer manager, for marshalling data to and from the device. @@ -78,8 +77,8 @@ class XlaTransferManager { // wraps the methods in XlaTransferManager. class XlaDeviceContext : public DeviceContext { public: - explicit XlaDeviceContext(perftools::gputools::Stream* stream, - xla::LocalClient* client, bool transfer_as_literal); + explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client, + bool transfer_as_literal); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, @@ -87,9 +86,7 @@ class XlaDeviceContext : public DeviceContext { void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece tensor_name, Device* device, Tensor* cpu_tensor, StatusCallback done) override; - perftools::gputools::Stream* stream() const override { - return manager_.stream(); - } + se::Stream* stream() const override { return manager_.stream(); } private: XlaTransferManager manager_; diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index ac60423d959ca44e7d92e2d965cf731287b1f83f..a8afbf9dcd736bb292b7c5f52c7cce2b47fb85b6 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -54,6 +54,15 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options, VLOG(1) << "Failed to create XLA_GPU device: " << status; return Status::OK(); } + + // TODO(b/78468222): Uncomment after fixing this bug + // status = device->CreateAndSetGpuDeviceInfo(); + // if (!status.ok()) { + // errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT, + // " device"); + // return status; + // } + devices->push_back(device.release()); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 3520501c1a39d243cc84583c47cbb0663bb201a8..2a7f04271d4b7ea330f32b88ea1e3f4037988a91 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -32,13 +32,12 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/util/stream_executor_util.h" +namespace tensorflow { namespace { -namespace gpu = perftools::gputools; using xla::ScopedShapedBuffer; using xla::ShapedBuffer; } // anonymous namespace -namespace tensorflow { std::map SnapshotResourceVariables(OpKernelContext* ctx, int num_variables) { std::map snapshot; @@ -57,24 +56,23 @@ std::map SnapshotResourceVariables(OpKernelContext* ctx, return snapshot; } -XlaAllocator::XlaAllocator(const gpu::Platform* platform, Allocator* wrapped) +XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped) : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {} XlaAllocator::~XlaAllocator() {} -xla::StatusOr XlaAllocator::Allocate( +xla::StatusOr XlaAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size); if (data == nullptr) { return errors::ResourceExhausted("Out of memory while trying to allocate ", size, " bytes."); } else { - return gpu::DeviceMemoryBase(data, size); + return se::DeviceMemoryBase(data, size); } } -Status XlaAllocator::Deallocate(int device_ordinal, - gpu::DeviceMemoryBase* mem) { +Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) { wrapped_->DeallocateRaw(mem->opaque()); return Status::OK(); } @@ -102,7 +100,7 @@ ScopedShapedBuffer ExtractSubShapedBuffer( /*target_base_index=*/{}); for (auto& index_to_buffer : shape_tree) { if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) { - index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0); + index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0); } } return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator); @@ -149,7 +147,7 @@ void XlaComputationLaunchContext::PopulateInputs( << xla::ShapeUtil::HumanStringWithLayout(on_device_shape) << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); - gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); + se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); arg_buffers_[i] = xla::MakeUnique( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); @@ -162,7 +160,7 @@ void XlaComputationLaunchContext::PopulateInputs( void XlaComputationLaunchContext::PopulateOutputs( OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel, ScopedShapedBuffer output) { - gpu::Stream* stream = + se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Computation output should always be a tuple. @@ -227,7 +225,7 @@ void XlaComputationLaunchContext::PopulateOutputs( const TensorShape& shape = kernel->outputs[i].shape; VLOG(2) << "Retval " << i << " shape " << shape.DebugString(); - gpu::DeviceMemoryBase buffer = output.buffer({output_num}); + se::DeviceMemoryBase buffer = output.buffer({output_num}); if (allocate_xla_tensors_) { Tensor* output_tensor; OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor)); @@ -238,7 +236,7 @@ void XlaComputationLaunchContext::PopulateOutputs( } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num}); ctx->set_output(i, output_tensor); } ++output_num; @@ -258,7 +256,7 @@ void XlaComputationLaunchContext::PopulateOutputs( write.input_index >= 0 && write.input_index < ctx->num_inputs(), errors::Internal("Invalid input index for variable write.")); - gpu::DeviceMemoryBase buffer = output.buffer({output_num}); + se::DeviceMemoryBase buffer = output.buffer({output_num}); Var* variable = nullptr; // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor, @@ -288,7 +286,7 @@ void XlaComputationLaunchContext::PopulateOutputs( } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( write.type, write.shape, buffer, allocator); - output.set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num}); *variable->tensor() = output_tensor; } ++output_num; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 26dcaa8a51d90f6706106ab8a4d762ddfec6a88d..8a6ff3b0c751206d184da63ef1a36e750a1252a5 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -46,13 +46,11 @@ std::map SnapshotResourceVariables(OpKernelContext* ctx, // see comment on `AllowsAsynchronousDeallocation()`. class XlaAllocator : public xla::DeviceMemoryAllocator { public: - XlaAllocator(const perftools::gputools::Platform* platform, - Allocator* wrapped); + XlaAllocator(const se::Platform* platform, Allocator* wrapped); ~XlaAllocator() override; - xla::StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure) override; - Status Deallocate(int device_ordinal, - perftools::gputools::DeviceMemoryBase* mem) override; + xla::StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override; // The Tensorflow BFC allocator used on GPU allows host-side deallocation // before GPU execution takes place. Tensorflow uses the ordering of the main @@ -126,8 +124,7 @@ class XlaTensorBuffer : public TensorBuffer { } static Tensor MakeTensor(DataType dtype, const TensorShape& shape, - perftools::gputools::DeviceMemoryBase buffer, - Allocator* allocator) { + se::DeviceMemoryBase buffer, Allocator* allocator) { size_t expected_size = shape.num_elements() * DataTypeSize(dtype); auto* tensor_buffer = new XlaTensorBuffer(buffer.opaque(), expected_size, buffer.size(), allocator); diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index 84b2835c4066005c194f73becb841fbcc48399b3..ce6456880bc1b3bc15ac0ef4bae35a83771098ef 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -31,16 +31,15 @@ namespace tensorflow { return FromTensor(const_cast(tensor)); } -/*static*/ perftools::gputools::DeviceMemoryBase -XlaTensor::DeviceMemoryFromTensor(const Tensor& tensor) { +/*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor( + const Tensor& tensor) { const XlaTensor* xla_tensor = FromTensor(&tensor); if (xla_tensor) { CHECK(xla_tensor->has_shaped_buffer()); return xla_tensor->shaped_buffer().root_buffer(); } else { - return perftools::gputools::DeviceMemoryBase( - const_cast(tensor.tensor_data().data()), - tensor.tensor_data().size()); + return se::DeviceMemoryBase(const_cast(tensor.tensor_data().data()), + tensor.tensor_data().size()); } } diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 2334fd272beb9e8185de10048421369179a0e475..922a91897312096e4bb6ee2a1cc153e0039e2c7a 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -43,8 +43,7 @@ class XlaTensor { // which case the returned value is shaped_buffer()->root_buffer(), or a // normal Tensor in which case the returned value is // {tensor.tensor_data().data(), tensor.tensor_data().size}. - static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor( - const Tensor& tensor); + static se::DeviceMemoryBase DeviceMemoryFromTensor(const Tensor& tensor); // Assign the internal ShapedBuffer to new memory for the given dtype and // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 46b86c53aa6870c9290e25da27f82a06302f6865..0c720932568b0c4eafd2127f76c0b77ad2f86904 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -308,6 +308,25 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "eager_test", + size = "small", + srcs = ["eager_test.py"], + disabled_backends = [ + # TODO(b/78199195) Support XLA CPU devices in eager runtime + "cpu", + "cpu_ondemand", + # TODO(b/78468222) Enable GPU backend + "gpu", + ], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "fft_test", size = "medium", @@ -904,3 +923,15 @@ tf_xla_py_test( "//tensorflow/python:platform_test", ], ) + +tf_xla_py_test( + name = "placeholder_test", + size = "small", + srcs = ["placeholder_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:platform_test", + ], +) diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py new file mode 100644 index 0000000000000000000000000000000000000000..bdd0185dfe4abe9d9acecc5381ff82c54b8c0705 --- /dev/null +++ b/tensorflow/compiler/tests/eager_test.py @@ -0,0 +1,137 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test cases for eager execution using XLA.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.platform import googletest + + +class EagerTest(XLATestCase): + + def testBasic(self): + with self.test_scope(): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, product) + + def testExecuteListOutputLen0(self): + with self.test_scope(): + empty = constant_op.constant([], dtype=dtypes.int32) + result = array_ops.unstack(empty, 0) + self.assertTrue(isinstance(result, list)) + self.assertEqual(0, len(result)) + + def testExecuteListOutputLen1(self): + with self.test_scope(): + split_dim = constant_op.constant(1) + value = constant_op.constant([[0, 1, 2], [3, 4, 5]]) + result = array_ops.split(value, 1, axis=split_dim) + self.assertTrue(isinstance(result, list)) + self.assertEqual(1, len(result)) + self.assertAllEqual([[0, 1, 2], [3, 4, 5]], result[0]) + + def testExecuteListOutputLen3(self): + with self.test_scope(): + split_dim = constant_op.constant(1) + value = constant_op.constant([[0, 1, 2], [3, 4, 5]]) + result = array_ops.split(value, 3, axis=split_dim) + self.assertTrue(isinstance(result, list)) + self.assertEqual(3, len(result)) + self.assertAllEqual([[0], [3]], result[0]) + self.assertAllEqual([[1], [4]], result[1]) + self.assertAllEqual([[2], [5]], result[2]) + + def testBasicGraph(self): + # Run some ops eagerly + with self.test_scope(): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, product) + + # Run some ops graphly + with context.graph_mode(), self.test_session() as sess: + with self.test_scope(): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, sess.run(product)) + + def testDegenerateSlices(self): + with self.test_scope(): + npt = np.arange(1, 19, dtype=np.float32).reshape(3, 2, 3) + t = constant_op.constant(npt) + # degenerate by offering a forward interval with a negative stride + self.assertAllEqual(npt[0:-1:-1, :, :], t[0:-1:-1, :, :]) + # degenerate with a reverse interval with a positive stride + self.assertAllEqual(npt[-1:0, :, :], t[-1:0, :, :]) + # empty interval in every dimension + self.assertAllEqual(npt[-1:0, 2:2, 2:3:-1], t[-1:0, 2:2, 2:3:-1]) + + def testIdentity(self): + with self.test_scope(): + self.assertAllEqual(2, array_ops.identity(2)) + + def testIdentityOnVariable(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(True) + i = array_ops.identity(v) + self.assertAllEqual(True, i.numpy()) + + def testAssignAddVariable(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(1.0) + v.assign_add(2.0) + self.assertEqual(3.0, v.numpy()) + + def testGradient(self): + def f(x): + return x + + with self.test_scope(): + grad_fn = backprop.gradients_function(f) + self.assertAllEqual(2., grad_fn(1., dy=2.)[0]) + + def testVariableGradient(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(1.0) + + def f(): + x = v0 * v0 + return x + + grads = backprop.implicit_grad(f)() + self.assertEqual(2., grads[0][0].numpy()) + + +if __name__ == "__main__": + ops.enable_eager_execution( + config=config_pb2.ConfigProto(log_device_placement=True)) + googletest.main() diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5e6d1313bd0336eba71fcf3658d949bd3342ae11 --- /dev/null +++ b/tensorflow/compiler/tests/placeholder_test.py @@ -0,0 +1,48 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for xla handling of placeholder_with_default.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import googletest + + +class PlaceholderTest(XLATestCase): + + def test_placeholder_with_default_default(self): + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable(4.0) + ph = array_ops.placeholder_with_default(v, shape=[]) + out = ph * 2 + sess.run(variables.variables_initializer([v])) + self.assertEqual(8.0, sess.run(out)) + + def test_placeholder_with_default_fed(self): + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable(4.0) + ph = array_ops.placeholder_with_default(v, shape=[]) + out = ph * 2 + sess.run(variables.variables_initializer([v])) + self.assertEqual(2.0, sess.run(out, {ph: 1.0})) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index ba5f829936fd82ca0cc53eda34aefbca6d80482b..ef047005b60bd156a677050368ef67ae030d6c3a 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.compiler.tests.xla_test import XLATestCase from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -68,40 +69,41 @@ class TernaryOpsTest(XLATestCase): expected=np.array([1, 3, 5], dtype=np.int32)) def testSelect(self): - self._testTernary( - array_ops.where, - np.array(0, dtype=np.bool), - np.array(2, dtype=np.float32), - np.array(7, dtype=np.float32), - expected=np.array(7, dtype=np.float32)) + for dtype in self.numeric_types: + self._testTernary( + array_ops.where, + np.array(0, dtype=np.bool), + np.array(2, dtype=dtype), + np.array(7, dtype=dtype), + expected=np.array(7, dtype=dtype)) - self._testTernary( - array_ops.where, - np.array(1, dtype=np.bool), - np.array([1, 2, 3, 4], dtype=np.float32), - np.array([5, 6, 7, 8], dtype=np.float32), - expected=np.array([1, 2, 3, 4], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array(1, dtype=np.bool), + np.array([1, 2, 3, 4], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array([1, 2, 3, 4], dtype=dtype)) - self._testTernary( - array_ops.where, - np.array(0, dtype=np.bool), - np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32), - np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32), - expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array(0, dtype=np.bool), + np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype)) - self._testTernary( - array_ops.where, - np.array([0, 1, 1, 0], dtype=np.bool), - np.array([1, 2, 3, 4], dtype=np.float32), - np.array([5, 6, 7, 8], dtype=np.float32), - expected=np.array([5, 2, 3, 8], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array([0, 1, 1, 0], dtype=np.bool), + np.array([1, 2, 3, 4], dtype=dtype), + np.array([5, 6, 7, 8], dtype=dtype), + expected=np.array([5, 2, 3, 8], dtype=dtype)) - self._testTernary( - array_ops.where, - np.array([0, 1, 0], dtype=np.bool), - np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32), - np.array([[7, 8], [9, 10], [11, 12]], dtype=np.float32), - expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=np.float32)) + self._testTernary( + array_ops.where, + np.array([0, 1, 0], dtype=np.bool), + np.array([[1, 2], [3, 4], [5, 6]], dtype=dtype), + np.array([[7, 8], [9, 10], [11, 12]], dtype=dtype), + expected=np.array([[7, 8], [3, 4], [11, 12]], dtype=dtype)) def testSlice(self): for dtype in self.numeric_types: @@ -119,6 +121,23 @@ class TernaryOpsTest(XLATestCase): np.array([2, 1], dtype=np.int32), expected=np.array([[2], [5]], dtype=dtype)) + def testClipByValue(self): + # TODO(b/78258593): enable integer types here too. + for dtype in self.float_types: + test_cases = [ + (np.array([2, 4, 5], dtype=dtype), dtype(7)), # + (dtype(1), np.array([2, 4, 5], dtype=dtype)), # + (np.array([-2, 7, 7], dtype=dtype), np.array([-2, 9, 8], dtype=dtype)) + ] + x = np.array([-2, 10, 6], dtype=dtype) + for lower, upper in test_cases: + self._testTernary( + gen_math_ops._clip_by_value, + x, + lower, + upper, + expected=np.minimum(np.maximum(x, lower), upper)) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index ba5c3a14849cefcb680b03425232724ff32375a8..942504e6bd4c9ce93c9482251823efcbb46ab1c8 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -412,7 +412,6 @@ cc_library( hdrs = ["functionalize_control_flow.h"], deps = [ ":tf2xla_util", - "//tensorflow/compiler/jit:graph_to_functiondef", "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla:dump_graph", "//tensorflow/compiler/tf2xla/ops:xla_ops", diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 23629d85aed316764b8ddd775c51c3890e5c16e3..8d1f2684909e876fe5521ba6a63d745c7d3956e0 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -21,13 +21,13 @@ limitations under the License. #include #include -#include "tensorflow/compiler/jit/graph_to_functiondef.h" #include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/control_flow.h" diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 579b66969990017688477443115cc4f61c18fe4a..00fd08b1a0750739445a124adc7ccf436a4a9b71 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -21,6 +21,7 @@ tf_kernel_library( "cast_op.cc", "categorical_op.cc", "cholesky_op.cc", + "clip_by_value_op.cc", "concat_op.cc", "const_op.cc", "conv_ops.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fdf75be7b1156540d762e3bc04a51f2478f00f46 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc @@ -0,0 +1,61 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace { + +class ClipByValueOp : public XlaOpKernel { + public: + explicit ClipByValueOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape shape = ctx->InputShape(0); + const TensorShape min_shape = ctx->InputShape(1); + const TensorShape max_shape = ctx->InputShape(2); + + xla::ComputationBuilder* builder = ctx->builder(); + auto input = ctx->Input(0); + auto min = ctx->Input(1); + auto max = ctx->Input(2); + + auto shape_error = [&]() -> tensorflow::Status { + return errors::InvalidArgument( + "clip_value_min and clip_value_max must be either of " + "the same shape as input, or a scalar. ", + "Input shape: ", shape.DebugString(), + " clip_value_min shape: ", min_shape.DebugString(), + " clip_value_max shape: ", max_shape.DebugString()); + }; + + if (shape != min_shape) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(min_shape), shape_error()); + min = builder->Broadcast(min, shape.dim_sizes()); + } + if (shape != max_shape) { + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(max_shape), shape_error()); + max = builder->Broadcast(max, shape.dim_sizes()); + } + ctx->SetOutput(0, builder->Clamp(min, input, max)); + } +}; + +REGISTER_XLA_OP(Name("ClipByValue"), ClipByValueOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc index 39af662b638cb9d723118e58fcfc983633fed497..e72200bfbcff20c55ac03030f1afc4bacaabf7ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc @@ -38,6 +38,7 @@ class IdentityOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Identity").CompilationOnly(), IdentityOp); REGISTER_XLA_OP(Name("IdentityN").CompilationOnly(), IdentityOp); +REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp); REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp); REGISTER_XLA_OP(Name("StopGradient"), IdentityOp); REGISTER_XLA_OP(Name("Snapshot"), IdentityOp); diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 7f72a6073df218b9e2bd4cc0c0b5bb10b5cd4b84..9bf5821b54abe3994085ad72043ff143077824c5 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -83,15 +83,6 @@ xla::StatusOr TriangularSolve( block_size); } - // Returns [b1, b2, ... , bn, indices[0], indices[1]]. - auto prepend_batch_dims = [&](std::array indices) { - std::vector output(ndims); - std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin()); - std::copy(indices.begin(), indices.end(), - output.begin() + batch_dimensions.size()); - return output; - }; - // Applies a complex conjugation operation if `a` is complex and `conjugate_a` // is true, otherwise returns its argument. auto maybe_conj = [&](xla::ComputationBuilder* builder, @@ -108,11 +99,12 @@ xla::StatusOr TriangularSolve( std::unique_ptr sub = builder->CreateSubBuilder( tensorflow::strings::StrCat("trsm_base_", k)); - auto a_param = - sub->Parameter(0, - xla::ShapeUtil::MakeShape(b_shape->element_type(), - prepend_batch_dims({k, k})), - "a"); + auto a_param = sub->Parameter( + 0, + xla::ShapeUtil::MakeShape( + b_shape->element_type(), + PrependMajorDims(sub.get(), batch_dimensions, {k, k})), + "a"); std::array b_lastd; if (left_side) { @@ -120,11 +112,12 @@ xla::StatusOr TriangularSolve( } else { b_lastd = {m, k}; } - auto b_param = - sub->Parameter(1, - xla::ShapeUtil::MakeShape(b_shape->element_type(), - prepend_batch_dims(b_lastd)), - "b"); + auto b_param = sub->Parameter( + 1, + xla::ShapeUtil::MakeShape( + b_shape->element_type(), + PrependMajorDims(sub.get(), batch_dimensions, b_lastd)), + "b"); // We use a left-looking subroutine on the block diagonal in some common // cases, while falling back to a recursive call in unsupported cases. The @@ -380,14 +373,6 @@ xla::StatusOr TriangularSolveLeftLooking( batch_dimensions.push_back(a_size); } - auto prepend_batch_dims = [&](std::array indices) { - std::vector output(ndims); - std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin()); - std::copy(indices.begin(), indices.end(), - output.begin() + batch_dimensions.size()); - return output; - }; - auto maybe_conj = [&](xla::ComputationBuilder* builder, xla::ComputationDataHandle x) { auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; @@ -479,30 +464,6 @@ xla::StatusOr TriangularSolveLeftLooking( auto body_b = bodyb->GetTupleElement(input_tuple, 3); auto zero = bodyb->ConstantR0(0); - // Set up some helper functions. - auto prepend_zeros = [&](std::array starts) { - auto zero = bodyb->Reshape(bodyb->ConstantR0(0), {1}); - std::vector padded_starts(ndims, zero); - padded_starts[ndims - 2] = bodyb->Reshape(starts[0], {1}); - padded_starts[ndims - 1] = bodyb->Reshape(starts[1], {1}); - return bodyb->ConcatInDim(padded_starts, 0); - }; - - auto dynamic_slice = [&](xla::ComputationDataHandle x, - std::array starts, - std::array sizes) { - auto padded_starts = prepend_zeros(starts); - auto padded_sizes = prepend_batch_dims(sizes); - return bodyb->DynamicSlice(x, padded_starts, padded_sizes); - }; - - auto update = [&](xla::ComputationDataHandle x, - xla::ComputationDataHandle update, - std::array starts) { - auto padded_starts = prepend_zeros(starts); - return bodyb->DynamicUpdateSlice(x, update, padded_starts); - }; - // We'd like to implement this: // if transpose_a: // a_row = T(a[..., i+1:, i:i+1]) @@ -516,22 +477,29 @@ xla::StatusOr TriangularSolveLeftLooking( // all zeros and use that as zero-padding (doing unnecessary FLOPs). xla::ComputationDataHandle a_row; if (transpose_a) { - a_row = dynamic_slice(body_a, {zero, i}, {m, 1}); + TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, + {zero, i}, {m, 1})); } else { - a_row = dynamic_slice(body_a, {i, zero}, {1, m}); + TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, + {i, zero}, {1, m})); } TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), a_row, body_out, /*transpose_x=*/transpose_a, /*transpose_y=*/false, /*conjugate_x=*/conjugate_a, /*conjugate_y=*/false)); - auto result_row = - bodyb->Sub(dynamic_slice(body_b, {i, zero}, {1, n}), b_update); + TF_ASSIGN_OR_RETURN( + auto result_row_slice, + DynamicSliceInMinorDims(bodyb.get(), body_b, {i, zero}, {1, n})); + auto result_row = bodyb->Sub(result_row_slice, b_update); // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] - auto a_elt = dynamic_slice(body_a, {i, i}, {1, 1}); + TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a, + {i, i}, {1, 1})); auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt)); - body_out = update(body_out, div_result, {i, zero}); + TF_ASSIGN_OR_RETURN(body_out, + DynamicUpdateSliceInMinorDims(bodyb.get(), body_out, + div_result, {i, zero})); // if transpose_a: // return (i - 1, body_out, a, b) diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 86263d847ae02d50e70dafb0129b2664c522f2a3..c0e996768491a6315c21021ce874b8a11557de6e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -813,4 +813,29 @@ Status XlaCompiler::SetHostToDeviceMetadata( return Status::OK(); } +Status XlaCompiler::GetHostComputeControlDependency( + const string& host_compute_name, xla::ComputationDataHandle* handle) { + const auto iter = host_compute_control_output_.find(host_compute_name); + if (iter == host_compute_control_output_.end()) { + return errors::InvalidArgument( + "No registered control handle for host compute Op '", host_compute_name, + "'"); + } else { + *handle = iter->second; + } + return Status::OK(); +} + +Status XlaCompiler::SetHostComputeControlDependency( + const string& host_compute_name, const xla::ComputationDataHandle& handle) { + if (host_compute_control_output_.find(host_compute_name) != + host_compute_control_output_.end()) { + return errors::InvalidArgument( + "Duplicate control handles registered for for host compute Op ", + host_compute_name); + } + host_compute_control_output_[host_compute_name] = handle; + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index a6747bbe72e161b2ece55697825cce0e71145a5c..8f564f35ec81765e8998513dfd4805d221200c6c 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -325,6 +325,23 @@ class XlaCompiler { gtl::ArraySlice types, gtl::ArraySlice shapes); + // In order to avoid deadlocks from dependencies in host computations, it can + // be necessary to enforce a partial order on the execution of HostCompute + // Ops. In particular it may be necessary to constrain the SendToHost for one + // HostCompute to run before blocking on the RecvAtHost for another + // HostCompute. The compiler maintains a mapping from 'host_compute_name' to + // handle, where the handle is an 'output' of the HostCompute Op corresponding + // to 'host_compute_name'. Another HostCompute Op that needs to be sequenced + // later can add the handle as an 'input' to enforce the constraints. + // 'host_compute_name' can be any string the client wishes to use to identify + // a given HostCompute Op as long as the names are unique within the + // compilation. + Status GetHostComputeControlDependency(const string& host_compute_name, + xla::ComputationDataHandle* handle); + Status SetHostComputeControlDependency( + const string& host_compute_name, + const xla::ComputationDataHandle& handle); + const Options& options() const { return options_; } xla::Client* client() const { return options_.client; } FunctionLibraryRuntime* flib_runtime() const { return flib_runtime_; } @@ -391,6 +408,9 @@ class XlaCompiler { std::unordered_map host_compute_sends_; std::unordered_map host_compute_recvs_; + std::unordered_map + host_compute_control_output_; + TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler); }; diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 88f37433a552e0dff1587075f1d31fd5d050302b..1af9cb6d2ab15a33b56f1df0410f47d7e139a1ba 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -605,8 +605,8 @@ cc_library( ":util", ":window_util", ":xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_evaluator", "//tensorflow/compiler/xla/service:shape_inference", diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index f0f94298a05f7c4bdc41cbfb8572454fbedd371d..328e1b8fa84e7baaca41c6c9a65e9a1598ac32ae 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -235,6 +235,11 @@ StatusOr Client::LoadSnapshot(const SessionModule& module) { return Computation(stub_, response.computation()); } +StatusOr Client::LoadSnapshot(const HloSnapshot& module) { + TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module()); + return XlaComputation(module.hlo().hlo_module()); +} + StatusOr> Client::Execute( const Computation& computation, tensorflow::gtl::ArraySlice arguments, diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index 14c685d94ea31c382d84223ca4e2eba544420d78..a63ff4c56d1dd78c7abfa2bf163b5fbd54d82b2b 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -255,6 +255,9 @@ class Client { StatusOr LoadSnapshot(const SessionModule& module); + // TODO(b/74197823): This is a part of a NOT YET ready refactor. + StatusOr LoadSnapshot(const HloSnapshot& module); + ServiceInterface* stub() { return stub_; } private: diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index d0e945b70fdb3c7abdcf94cb03d3b962f8556576..1c1270590375ab54e5d7b56344db1b2d40e5b89c 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -166,12 +166,8 @@ StatusOr LocalExecutable::Run( if (executable_->dumping()) { return ExecuteAndDump(&service_options, arguments); } - TF_ASSIGN_OR_RETURN( - ShapedBuffer result, - executable_->ExecuteOnStreamWrapper( - &service_options, run_options.execution_profile(), arguments)); - - return ScopedShapedBuffer(std::move(result), run_options.allocator()); + return executable_->ExecuteOnStreamWrapper( + &service_options, run_options.execution_profile(), arguments); } StatusOr LocalExecutable::ExecuteAndDump( @@ -181,12 +177,12 @@ StatusOr LocalExecutable::ExecuteAndDump( backend_->platform()->Name()); TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module())); TF_ASSIGN_OR_RETURN( - ShapedBuffer result, + ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module())); TF_RETURN_IF_ERROR(executable_->DumpSessionModule()); - return ScopedShapedBuffer(std::move(result), run_options->allocator()); + return std::move(result); } tensorflow::Status LocalExecutable::RecordArguments( diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD index 31fa1241ee474a31575c45cf7652063dfc818fac..0d6e207971ec64515ec5e6da292910920edd101a 100644 --- a/tensorflow/compiler/xla/client/xla_client/BUILD +++ b/tensorflow/compiler/xla/client/xla_client/BUILD @@ -31,9 +31,9 @@ cc_library( hdrs = ["xla_computation.h"], deps = [ "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h index 5977ee4f4bfc6e1a8acf664ed476da201408d7e9..4955f1515d66af00ddf72e4c7621292a590e662c 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h @@ -57,11 +57,27 @@ class XlaOp { StatusOr GetShape() const; + const XlaBuilder* builder() const { return builder_; } + + bool operator==(const XlaOp& rhs) const { + return handle_ == rhs.handle_ && builder_ == rhs.builder_; + } + + bool operator!=(const XlaOp& rhs) const { + return handle_ != rhs.handle_ || builder_ != rhs.builder_; + } + + friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) { + out << op.handle(); + return out; + } + private: XlaOp(int64 handle, XlaBuilder* builder) : handle_(handle), builder_(builder) {} int64 handle() const { return handle_; } + friend class XlaBuilder; int64 handle_; diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc index a6752c601026518825c7994f6b6fa20d20f34f24..72e3935696e0c44ae3893fc8f1ceb261fa5e2646 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc @@ -17,7 +17,9 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { @@ -26,4 +28,13 @@ StatusOr XlaComputation::GetProgramShape() const { return proto_.program_shape(); } +StatusOr> XlaComputation::Snapshot() const { + if (IsNull()) { + return InvalidArgument("Computation is invalid."); + } + auto session = MakeUnique(); + *session->mutable_hlo()->mutable_hlo_module() = proto_; + return std::move(session); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h index 7ad212aa24cd32d104cc4db7aa164c22c9f5be8f..b70b57e9ffec40188f246f5e884146012c02f4a2 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h @@ -48,6 +48,10 @@ class XlaComputation { const HloModuleProto& proto() const { return proto_; } + // Requests that we snapshot the computation into a serializable protocol + // buffer form. + StatusOr> Snapshot() const; + // Returns true if this object is a null Computation. bool IsNull() const { return unique_id_ == -1; } diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index 70ae95bf47398589e3c20f72c1f2084a738f253a..bc8405703b02dc1b0c4c87005ea3c15372552157 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -43,7 +43,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) { #ifdef INTEL_MKL flags->set_xla_cpu_use_mkl_dnn(true); #endif // INTEL_MKL - flags->set_xla_gpu_max_kernel_unroll_factor(1); + flags->set_xla_gpu_max_kernel_unroll_factor(4); // Set cudnn batchnorm off by default; it does not provide a performance win // on average. flags->set_xla_gpu_use_cudnn_batchnorm(false); diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index c315b4ff30059147ee33dcdd5b0858a1c39e5999..bb6dd4f9098aefc1c2bbb1b1c41b3cee856b67de 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -44,8 +44,16 @@ namespace { constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; -// Converts between little and big endian, assuming elements in the array are 16 -// bits long. +// Converts between little and big endian. +// +// Precondition: size % 2 == 0 (elements in the array are 16 bits long) +void ConvertEndianShort(string* bytes) { + CHECK_EQ(bytes->size() / 2, 0); + for (int64 i = 0; i < bytes->size(); i += 2) { + std::swap((*bytes)[i], (*bytes)[i + 1]); + } +} + void ConvertEndianShort(char* bytes, int64 size) { CHECK_EQ(size / 2, 0); for (int64 i = 0; i < size; i += 2) { @@ -1930,16 +1938,14 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const { *proto->mutable_f16s() = string( reinterpret_cast(data().data()), size_bytes()); if (!kLittleEndian) { - ConvertEndianShort(const_cast(proto->mutable_f16s()->data()), - proto->f16s().size()); + ConvertEndianShort(proto->mutable_f16s()); } break; case BF16: *proto->mutable_bf16s() = string( reinterpret_cast(data().data()), size_bytes()); if (!kLittleEndian) { - ConvertEndianShort(const_cast(proto->mutable_bf16s()->data()), - proto->bf16s().size()); + ConvertEndianShort(proto->mutable_bf16s()); } break; case F32: diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h index c58c19db2cacbe9b038160f27b9bd76aa58146eb..bfcdfc62f9541ab09b94a48d5121e16bad4d43cd 100644 --- a/tensorflow/compiler/xla/ptr_util.h +++ b/tensorflow/compiler/xla/ptr_util.h @@ -28,26 +28,8 @@ limitations under the License. #include "tensorflow/core/util/ptr_util.h" namespace xla { - -template -std::unique_ptr WrapUnique(T* ptr) { - return tensorflow::WrapUnique(ptr); -} - -template -typename tensorflow::helper::MakeUniqueResult::scalar MakeUnique( - Args&&... args) { - return tensorflow::MakeUnique(std::forward(args)...); -} - -// Overload for array of unknown bound. -// The allocation of arrays needs to use the array form of new, -// and cannot take element constructor arguments. -template -typename tensorflow::helper::MakeUniqueResult::array MakeUnique(size_t n) { - return tensorflow::MakeUnique(n); -} - +using tensorflow::MakeUnique; +using tensorflow::WrapUnique; } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_ diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index eec48479c929ab0823fef342fc284bfdc4b1f339..dc6f5fe5fcc067c99ced01812f9f2388a00766d0 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -181,16 +181,6 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { PyObjectCppRepr(o).c_str()); }; - auto get_attr = [o, &error](const string& field) -> StatusOr { - PyObject* result = - PyObject_GetAttrString(o, const_cast(field.c_str())); - if (result == nullptr) { - return error(tensorflow::strings::StrCat( - "Failed to get attribute of Shape object:", field)); - } - return result; - }; - auto call_method = [o, &error](const string& method) -> StatusOr { PyObject* result = PyObject_CallMethod(o, const_cast(method.c_str()), nullptr); @@ -202,12 +192,16 @@ StatusOr XlaShapeFromPyShape(PyObject* o) { }; PyObject* np_type; - TF_ASSIGN_OR_RETURN(np_type, get_attr("np_dtype")); + TF_ASSIGN_OR_RETURN(np_type, call_method("numpy_dtype")); if (np_type->ob_type != &PyArrayDescr_Type) { - return error("Shape attribute np_dtype is not an integer numpy dtype"); + return error( + "Return value of shape method numpy_dtype " + "is not an integer numpy dtype"); } if (!NumpyTypeIsValid(NumpyTypenum(np_type))) { - return error("Shape attribute np_dtype is not a valid integer numpy dtype"); + return error( + "Return value of shape method numpy_dtype " + "is not a valid integer numpy dtype"); } const PrimitiveType element_type = NumpyTypeToPrimitiveType(NumpyTypenum(np_type)); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 9c81f6439d0d9f0a0f0d1d3402e9c1ada46e8691..f6809b6b871d7e246dd43811c7e8c08378d53989 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -166,14 +166,14 @@ class LocalBuffer(object): self._delete = c_api.DeleteLocalShapedBuffer @staticmethod - def from_py(npval, layout_fn=None): - npval = require_numpy_array_layout(npval) + def from_pyval(pyval, layout_fn=None): + pyval = require_numpy_array_layout(pyval) if layout_fn: - shape = Shape.from_numpy(npval) + shape = Shape.from_pyval(pyval) shape = shape.map_leaves(layout_fn) else: shape = None - return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(npval, shape)) + return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape)) def to_py(self): return self.c_local_shaped_buffer.ToLiteral() @@ -191,53 +191,104 @@ class LocalBuffer(object): class Shape(object): - """XLA shape. + """Represents an XLA shape. - Represents an XLA shape by a corresponding Python/Numpy type and a - list of dimensions, which are themselves Shapes in case this one - represents an XLA tuple. + A shape is either an array shape, having rank-many integer + dimensions and an element type (represented by a Numpy dtype), or it + is a tuple shape, having a shape for every tuple component: + + type shape = + TupleShape of shape list + | ArrayShape of { dimensions: int list; element_type: dtype } + + Callers are expected to instantiate this class only via the static + constructors: tuple_shape, array_shape, and from_pyval. """ - def __init__(self, np_dtype, dimensions, minor_to_major=None): + @staticmethod + def tuple_shape(tuple_shapes): + """Construct a tuple shape.""" + if (not isinstance(tuple_shapes, (tuple, list)) or + not all(isinstance(t, Shape) for t in tuple_shapes)): + raise TypeError('tuple_shapes must be a tuple of Shapes') + return Shape(tuple_shapes, tuple) + + @staticmethod + def array_shape(element_type, dimensions, minor_to_major=None): + """Construct an array shape.""" + if (not isinstance(dimensions, tuple) or + not all(isinstance(i, int) for i in dimensions)): + dimensions = tuple(int(i) for i in dimensions) + return Shape(dimensions, np.dtype(element_type), + minor_to_major=minor_to_major) + + @staticmethod + def from_pyval(pyval): + def convert(pyval): + if isinstance(pyval, tuple): + return Shape.tuple_shape(tuple(convert(elt) for elt in pyval)) + else: + pyval = require_numpy_array_layout(pyval) + return Shape.array_shape(pyval.dtype, np.shape(pyval)) + return convert(pyval) + + def __init__(self, dimensions, dtype, minor_to_major=None): assert isinstance(dimensions, tuple) - self.np_dtype = np_dtype self._dimensions = dimensions + self._dtype = dtype + self._is_tuple = dtype == tuple self._minor_to_major = minor_to_major self._check_minor_to_major() def __eq__(self, other): # pylint: disable=protected-access - return (self.np_dtype == other.np_dtype and + return (self._dtype == other._dtype and self._dimensions == other._dimensions and self._minor_to_major == other._minor_to_major) def __repr__(self): - return ('xla_client.Shape(np_dtype={!r}, dimensions={!r}, ' - 'minor_to_major={!r})').format(self.np_dtype, self._dimensions, - self._minor_to_major) - - def element_type(self): - return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.np_dtype)] + return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, ' + '_is_tuple={!r}), _minor_to_major={!r}').format( + self._dtype, self._dimensions, self._is_tuple, + self._minor_to_major) def is_tuple(self): - return self.element_type() == xla_data_pb2.TUPLE + return self._is_tuple - def dimensions(self): - if self.is_tuple(): - raise ValueError('Tuple shape has no dimensions') - return self._dimensions - - def minor_to_major(self): - return self._minor_to_major + def is_array(self): + return not self._is_tuple def tuple_shapes(self): if not self.is_tuple(): - raise ValueError('Shape is not a tuple shape') + raise ValueError('not a tuple shape') + return self._dimensions + + def numpy_dtype(self): + """Like element_type(), but returns dtype('O') in case of a tuple shape.""" + if self.is_tuple(): + return np.dtype(np.object) + else: + return self.element_type() + + def xla_element_type(self): + return DTYPE_TO_XLA_ELEMENT_TYPE[str(self.numpy_dtype())] + + def element_type(self): + if not self.is_array(): + raise ValueError('not an array shape') + return self._dtype + + def dimensions(self): + if not self.is_array(): + raise ValueError('not an array shape') return self._dimensions def rank(self): return len(self.dimensions()) + def minor_to_major(self): + return self._minor_to_major + def map_leaves(self, f): """Map f over each leaf-level array subshape. @@ -250,7 +301,7 @@ class Shape(object): """ if self.is_tuple(): children = tuple(child.map_leaves(f) for child in self.tuple_shapes()) - return Shape(np.dtype('O'), children) + return Shape.tuple_shape(children) else: mapped = f(self) return self if mapped is None else mapped @@ -264,30 +315,24 @@ class Shape(object): assert sorted(mtm) == range(len(mtm)), self def update_minor_to_major(self, minor_to_major): + if not self.is_array(): + raise ValueError('not an array shape') if not isinstance(minor_to_major, tuple): raise TypeError('minor_to_major must be a tuple') - updated = Shape(self.np_dtype, tuple(self.dimensions()), minor_to_major) + updated = Shape.array_shape( + self.element_type(), self.dimensions(), minor_to_major) updated._check_minor_to_major() # pylint: disable=protected-access return updated - @staticmethod - def from_numpy(npval): - - def convert(npval): - if isinstance(npval, tuple): - return Shape(np.dtype('O'), tuple(convert(elt) for elt in npval)) - else: - return Shape(npval.dtype, np.shape(npval)) - - return convert(require_numpy_array_layout(npval)) - def _wrap_shape(shape_info): dtype, dims = shape_info element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(dtype)] if element_type == xla_data_pb2.TUPLE: - dims = tuple(_wrap_shape(subshape_info) for subshape_info in dims) - return Shape(dtype, dims) + shapes = tuple(_wrap_shape(subshape_info) for subshape_info in dims) + return Shape.tuple_shape(shapes) + else: + return Shape.array_shape(dtype, dims) def _wrap_data_handle(handle): @@ -420,7 +465,7 @@ class LocalComputation(object): compile_options=None, layout_fn=None): return self.Compile( - argument_shapes=[Shape.from_numpy(arg) for arg in arguments], + argument_shapes=[Shape.from_pyval(arg) for arg in arguments], compile_options=compile_options, layout_fn=layout_fn) @@ -428,7 +473,7 @@ class LocalComputation(object): """Execute with Python values as arguments and return value.""" if not self.is_compiled: raise ValueError('Cannot execute an uncompiled local XLA computation.') - argument_shapes = [Shape.from_numpy(arg) for arg in arguments] + argument_shapes = [Shape.from_pyval(arg) for arg in arguments] if layout_fn: argument_shapes = [ shape.map_leaves(layout_fn) for shape in argument_shapes @@ -607,7 +652,7 @@ class ComputationBuilder(object): A ComputationDataHandle message. """ return self.ParameterWithShape( - Shape.from_numpy(value), name=name, parameter_num=parameter_num) + Shape.from_pyval(value), name=name, parameter_num=parameter_num) def Broadcast(self, operand, sizes): """Enqueues a broadcast operation onto the computation. @@ -968,7 +1013,7 @@ class ComputationBuilder(object): Returns: a ComputationDataHandle to the generated array of F32 values. """ - shape = Shape(self.GetShape(mu).np_dtype, dims) + shape = Shape.array_shape(self.GetShape(mu).element_type(), dims) return _wrap_data_handle( self._client.RngNormal( _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape)) @@ -988,7 +1033,7 @@ class ComputationBuilder(object): Returns: a ComputationDataHandle to the generated array of values with the same numeric type (F32, S32, or U32) as the arguments a and b. """ - shape = Shape(self.GetShape(a).np_dtype, dims) + shape = Shape.array_shape(self.GetShape(a).element_type(), dims) return _wrap_data_handle( self._client.RngUniform( _unwrap_data_handle(a), _unwrap_data_handle(b), shape)) diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 433ea568776102539d4cfe7364eedafe22caf6ac..c073c02040e4d260cf760ea2b25f70d60ddd41a1 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -319,7 +319,7 @@ class LocalBufferTest(LocalComputationTest): def _Execute(self, c, arguments): compiled_c = c.Build().CompileWithExampleArguments(arguments) - arg_buffers = [xla_client.LocalBuffer.from_py(arg) for arg in arguments] + arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments] result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers) return result_buffer.to_py() @@ -350,7 +350,7 @@ class LocalBufferTest(LocalComputationTest): c.Add(c.ParameterFromNumpy(NumpyArrayF32(0.)), c.ConstantF32Scalar(3.14)) arg = NumpyArrayF32(1.11) compiled_c = c.Build().CompileWithExampleArguments([arg]) - arg_buffer = xla_client.LocalBuffer.from_py(arg) + arg_buffer = xla_client.LocalBuffer.from_pyval(arg) arg_buffer.delete() with self.assertRaises(ValueError): compiled_c.ExecuteWithLocalBuffers([arg_buffer]) @@ -1287,7 +1287,7 @@ class EmbeddedComputationsTest(LocalComputationTest): def testInfeedS32Values(self): to_infeed = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - c.Infeed(xla_client.Shape.from_numpy(to_infeed[0])) + c.Infeed(xla_client.Shape.from_pyval(to_infeed[0])) compiled_c = c.Build().CompileWithExampleArguments() for item in to_infeed: xla_client.transfer_to_infeed(item) @@ -1299,7 +1299,7 @@ class EmbeddedComputationsTest(LocalComputationTest): def testInfeedThenOutfeedS32(self): to_round_trip = NumpyArrayS32([1, 2, 3, 4]) c = self._NewComputation() - x = c.Infeed(xla_client.Shape.from_numpy(to_round_trip[0])) + x = c.Infeed(xla_client.Shape.from_pyval(to_round_trip[0])) c.Outfeed(x) compiled_c = c.Build().CompileWithExampleArguments() @@ -1309,7 +1309,7 @@ class EmbeddedComputationsTest(LocalComputationTest): execution.start() xla_client.transfer_to_infeed(want) got = xla_client.transfer_from_outfeed( - xla_client.Shape.from_numpy(to_round_trip[0])) + xla_client.Shape.from_pyval(to_round_trip[0])) execution.join() self.assertEqual(want, got) diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index ad3a28e11939d6259ebd75d544a950ba7abd741f..df9dbc58308f047484cccd6919ca0abf328622eb 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -90,7 +90,7 @@ std::unique_ptr> MatmulArray2DImpl( Padding padding) { return ConvArray3DGeneralDimensionsDilated( lhs, rhs, kernel_stride, padding, 1, 1, - ComputationBuilder::CreateDefaultConvDimensionNumbers(1)); + XlaBuilder::CreateDefaultConvDimensionNumbers(1)); } /*static*/ std::unique_ptr> @@ -140,7 +140,7 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated( std::pair kernel_stride, Padding padding) { return ConvArray4DGeneralDimensions( lhs, rhs, kernel_stride, padding, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); } /* static */ std::unique_ptr> diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 9009cbf845e7cfa630abe1c68b68d4f04e4d4bc0..d55da3686cdfdf0a33ebd6d4e5c1d273192b2f6d 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -359,6 +359,7 @@ cc_library( ":hlo", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", ], ) @@ -755,6 +756,7 @@ cc_library( ":hlo", ":hlo_execution_profile", ":hlo_graph_dumper", + ":hlo_proto", ":pool", ":session_proto", ":shaped_buffer", @@ -2032,6 +2034,7 @@ cc_library( srcs = ["hlo_verifier.cc"], hdrs = ["hlo_verifier.h"], deps = [ + ":hlo", ":hlo_pass", ":shape_inference", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 6bf65825cd02a77e51e9fc521516164d93483084..cf1231bcce4d004284b71a49063e3e470a9eb93f 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -31,23 +31,35 @@ limitations under the License. namespace xla { StatusOr AllocationTracker::Register( - ShapedBuffer shaped_buffer, const string& tag) { + ScopedShapedBuffer shaped_buffer, const string& tag) { tensorflow::mutex_lock lock(mutex_); VLOG(2) << "Register"; - std::vector replicated_buffers; + std::vector replicated_buffers; replicated_buffers.emplace_back(std::move(shaped_buffer)); return RegisterInternal(std::move(replicated_buffers), tag); } StatusOr AllocationTracker::RegisterReplicatedBuffers( - std::vector replicated_buffers, const string& tag) { + std::vector replicated_buffers, const string& tag) { tensorflow::mutex_lock lock(mutex_); VLOG(2) << "RegisterReplicatedBuffers"; return RegisterInternal(std::move(replicated_buffers), tag); } +// ReleaseIfScopedShapedBuffer lets RegisterInternal(b) call +// b.release() if b is a ScopedShapedBuffer, or otherwise pass b through +// unmodified. +static ShapedBuffer ReleaseIfScopedShapedBuffer(ShapedBuffer b) { return b; } +static ShapedBuffer ReleaseIfScopedShapedBuffer(ScopedShapedBuffer b) { + return b.release(); +} + +template StatusOr AllocationTracker::RegisterInternal( - std::vector replicated_buffers, const string& tag) { + std::vector replicated_buffers, const string& tag) { + static_assert(std::is_same::value || + std::is_same::value, + "ShapedBufferTy must be ShapedBuffer or ScopedShapedBuffer."); VLOG(2) << "RegisterInternal(" << "tag: \"" << tag << "\" with " << replicated_buffers.size() << " shaped_buffers."; @@ -65,17 +77,22 @@ StatusOr AllocationTracker::RegisterInternal( int64 handle = next_handle_++; for (auto& shaped_buffer : replicated_buffers) { std::vector shape_indices; - ShapeUtil::ForEachSubshape(shaped_buffer.on_device_shape(), - [this, &shape_indices](const Shape& /*subshape*/, - const ShapeIndex& index) { - shape_indices.push_back(index); - }); + ShapeUtil::ForEachSubshape( + shaped_buffer.on_device_shape(), + [&](const Shape& /*subshape*/, const ShapeIndex& index) { + shape_indices.push_back(index); + }); + // Add shaped_buffer's buffers to opaque_to_allocation_map_, which owns + // them. for (const ShapeIndex& index : shape_indices) { AddAllocationOrIncrementRefCount(shaped_buffer.buffer(index), shaped_buffer.device_ordinal()); } - handle_to_shaped_buffers_[handle].emplace_back( - MakeUnique(std::move(shaped_buffer))); + // If ShapedBufferTy is ScopedShapedBuffer, release the ScopedShapedBuffer + // into a regular ShapedBuffer, which is stored in + // handle_to_shaped_buffers_. + handle_to_shaped_buffers_[handle].emplace_back(MakeUnique( + ReleaseIfScopedShapedBuffer(std::move(shaped_buffer)))); } GlobalDataHandle result; @@ -102,10 +119,6 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) { shaped_buffer->device_ordinal())); } } - return Reset(data); -} - -Status AllocationTracker::Reset(const GlobalDataHandle& data) { // Keep a nullptr as a tombstone for unregistered handles. This enables // better error messages. That is, "handle has been deallocated" versus // "handle does not exist". @@ -152,7 +165,7 @@ StatusOr> AllocationTracker::DeconstructTuple( element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}), /*index=*/{}); std::vector replicated_buffers; - replicated_buffers.emplace_back(std::move(element_buffer)); + replicated_buffers.push_back(std::move(element_buffer)); TF_ASSIGN_OR_RETURN( GlobalDataHandle element_handle, RegisterInternal(std::move(replicated_buffers), "deconstructed tuple")); diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 2bfcd5371293635f6525d0c5506c606e2762b943..1174fa641c06ae053bcc652416bfbc30cabc777c 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -45,13 +45,13 @@ class AllocationTracker { // Registers a shaped buffer of device memory, and returns a corresponding // handle that can be used for talking to XLA clients. The given shaped buffer // will be treated as the buffer corresponding to the only replica. - StatusOr Register(ShapedBuffer shaped_buffer, + StatusOr Register(ScopedShapedBuffer shaped_buffer, const string& tag); // Registers a vector of shaped buffers of device memory, one per replica, and // returns a corresponding handle that can be used for talking to XLA clients. StatusOr RegisterReplicatedBuffers( - std::vector replicated_buffers, const string& tag); + std::vector replicated_buffers, const string& tag); // Unregister the allocation for the given data handle. Status Unregister(const GlobalDataHandle& data); @@ -87,21 +87,21 @@ class AllocationTracker { }; // Internal helper which resolves the given GlobalDataHandle to a - // ShapedBuffer. + // list of ScopedShapedBuffers. StatusOr> ResolveInternal( const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Internal helper which registers a vector of shaped buffers, one per - // replica. + // replica. ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer. If + // it's ShapedBuffer, all of the given buffers must already be tracked by this + // object -- presumably this is a call from DeconstructTuple. + template StatusOr RegisterInternal( - std::vector replicated_buffers, const string& tag) + std::vector replicated_buffers, const string& tag) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Resets the shaped buffers corresponding to the given handle. - Status Reset(const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - // Adds the given device address to the allocation tracker, or if it already - // exists, then increment it's reference count. + // exists, then increment its reference count. void AddAllocationOrIncrementRefCount(se::DeviceMemoryBase device_memory, int device_ordinal) EXCLUSIVE_LOCKS_REQUIRED(mutex_); @@ -133,7 +133,19 @@ class AllocationTracker { // buffers for different replicas. // // The ShapedBuffers in this map's vectors need to be unique_ptrs, because our - // public API returns pointers to them. + // public API returns pointers to them. We expect the concrete class to be + // ShapedBuffer and never ScopedShapedBuffer; deallocation of buffers is + // handled by opaque_to_allocation_map_. + // + // The elements of the vectors need to be unique_ptrs because we return + // pointers to them. (In theory we could use std::list or something instead, + // but we also want to be able to null out these elements.) + // + // The reason that the elements can't be unique_ptrs is + // the existence of DeconstructTuple(). This function allows us to create a + // non-owning "view" into a tuple's sub-buffers. The sub-buffers are then + // free'd when both the view *and* the original tuple are Unregistered. This + // refcounting is managed in opaque_to_allocation_map_. tensorflow::gtl::FlatMap>> handle_to_shaped_buffers_ GUARDED_BY(mutex_); diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index a582dbffd688a4ca904539a462aa268a0b4d3cb6..b1d616ec3506f9e65dac045c8192c4bf061e90f4 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 246b80286189286dd29a306dd0bda495df9dad3e..04fda3b2df574502aa882dd6bcff5ff2022c8edf 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -89,12 +89,10 @@ cc_library( ":cpu_instruction_fusion", ":cpu_layout_assignment", ":cpu_options", - ":cpu_parallelization_preparation", ":disassembler", ":dot_op_emitter", ":ir_emission_utils", ":ir_emitter", - ":parallel_cpu_executable", ":parallel_task_assignment", ":simple_orc_jit", "//tensorflow/compiler/xla:literal_util", @@ -232,35 +230,6 @@ cc_library( ], ) -cc_library( - name = "parallel_cpu_executable", - srcs = ["parallel_cpu_executable.cc"], - hdrs = [ - "parallel_cpu_executable.h", - ], - deps = [ - ":cpu_runtime", - ":shape_partition", - ":simple_orc_jit", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:device_memory_allocator", - "//tensorflow/compiler/xla/service:executable", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_execution_profile", - "//tensorflow/compiler/xla/service:logical_buffer", - "//tensorflow/compiler/xla/service:shaped_buffer", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", - "@llvm//:orc_jit", - ], -) - cc_library( name = "ir_emitter", srcs = [ @@ -661,25 +630,6 @@ cc_library( ], ) -cc_library( - name = "cpu_parallelization_preparation", - srcs = ["cpu_parallelization_preparation.cc"], - hdrs = [ - "cpu_parallelization_preparation.h", - ], - deps = [ - ":ir_emission_utils", - ":parallel_task_assignment", - ":shape_partition", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_cost_analysis", - "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:lib", - ], -) - cc_library( name = "ir_emission_utils", srcs = ["ir_emission_utils.cc"], diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index e8472fd36b360fcb1f73cd97128fe7975bf2cd5c..3c0c367df30639dc7da148c180e2697c71223789 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -56,12 +56,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h" #include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" -#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h" #include "tensorflow/compiler/xla/service/cpu/disassembler.h" #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h" #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h" -#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h" #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h" #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" @@ -308,10 +306,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { module->config().intra_op_parallelism_threads() > 0 ? module->config().intra_op_parallelism_threads() : tensorflow::port::NumSchedulableCPUs(); - if (options::CpuParallelBackendRequested(module->config())) { - pipeline.AddPass(max_parallelism, - ShapeSizeBytesFunction()); - } else if (!is_aot_compile) { + if (!is_aot_compile) { // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module. // Note this is not run for AOT because it would bring in thread pool // and thread synchronization dependencies which would likely increase @@ -329,13 +324,6 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); - if (options::CpuParallelBackendRequested(module->config())) { - // Re-run the outlining, in case any copies were inserted into the entry - // computation. - pipeline.AddPass(max_parallelism, - ShapeSizeBytesFunction()); - pipeline.AddPass(); - } pipeline.AddPass(); return pipeline.Run(module).status(); } @@ -522,190 +510,80 @@ StatusOr> CpuCompiler::RunBackend( const string xla_dump_optimized_hlo_proto_to = module->config().debug_options().xla_dump_optimized_hlo_proto_to(); - if (options::CpuParallelBackendRequested(module->config())) { - VLOG(1) << "Using parallel cpu backend"; - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - // DependencyHloOrdering is used for the parallel emitter because the order - // of HLO instruction execution is not known ahead of time. - // DependencyHloOrdering is the most conservative partial order and only - // uses data dependencies for determining order. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run( - module.get(), xla::MakeUnique(module.get()), - BufferSizeBytesFunction(), memory_alignment)); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. - XLA_VLOG_LINES(2, assignment->ToString()); - - if (!xla_dump_optimized_hlo_proto_to.empty()) { - HloProto proto = MakeHloProto(*module, *assignment); - TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_optimized_hlo_proto_to, module->name())); - } - - // If we are using the parallel CPU backend, we need to create map from - // HloInstruction to the corresponding generated function name. - std::map parallel_computations; - std::unordered_map> - aligned_constants; - for (auto instruction : entry_computation->MakeInstructionPostOrder()) { - // Parameters and constants don't get their own computation. - if (instruction->opcode() == HloOpcode::kParameter) { - continue; - } - if (instruction->opcode() == HloOpcode::kConstant) { - // Copy the constant out of the ProtocolBuffer so that we can give it a - // higher alignment. - const void* data = instruction->literal().untyped_data(); - int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape()); - auto iter = aligned_constants.emplace( - instruction, xla::MakeUnique(size)); - CHECK_EQ(iter.second, true); - unsigned char* aligned_data = iter.first->second.get(); - memcpy(aligned_data, data, size); - continue; - } - // The parallel preparation should have ensured that the top-level - // computation consists solely of Call instructions. - TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall) - << module->ToString(); - HloComputation* to_apply = instruction->to_apply(); - parallel_computations.emplace(to_apply, instruction); - } - - IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - jit->target_machine(), jit->external_constant_pool()); - - std::unique_ptr> function_names( - new HloInstructionMap()); - for (auto embedded_computation : - entry_computation->MakeEmbeddedComputationsList()) { - if (embedded_computation->IsFusionComputation()) { - continue; - } - auto parallel_computation_iter = - parallel_computations.find(embedded_computation); - // All parallel computations are considered to be an entry computation for - // IR generation purposes. - bool computation_is_parallel = - parallel_computation_iter != parallel_computations.end(); - TF_ASSIGN_OR_RETURN( - llvm::Function * ir_function, - ir_emitter.EmitComputation( - embedded_computation, embedded_computation->name(), - /*is_top_level_computation=*/computation_is_parallel, - /*instruction_order=*/nullptr)); - // If this computation is parallel, remember it in the function name map. - // This way we know what function to execute when we try to run code for - // the Call instruction. - if (computation_is_parallel) { - HloInstruction* call_instruction = parallel_computation_iter->second; - InsertOrDie(function_names.get(), call_instruction, - llvm_ir::AsString(ir_function->getName())); - } - } - - string ir_module_string; - if (embed_ir_in_executable) { - ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); - } - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - - // JIT compile the LLVM IR module to in-memory machine code. - jit->AddModule(std::move(llvm_module)); - cpu_executable.reset(new ParallelCpuExecutable( - std::move(jit), std::move(assignment), std::move(module), - std::move(function_names), std::move(aligned_constants), - std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map))); - - if (embed_ir_in_executable) { - static_cast(*cpu_executable) - .set_ir_module_string(ir_module_string); - } - } else { - VLOG(1) << "Using sequential cpu backend"; - - // Select an order for emitting the HLO instructions for each - // computation. Using this sequence enables tighter buffer liveness analysis - // and reduced memory usage (as compared to using DependencyHloOrdering). - TF_ASSIGN_OR_RETURN( - SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run(module.get(), - xla::MakeUnique( - module.get(), module_sequence), - BufferSizeBytesFunction(), memory_alignment)); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. - XLA_VLOG_LINES(2, assignment->ToString()); - - if (!xla_dump_optimized_hlo_proto_to.empty()) { - HloProto proto = MakeHloProto(*module, *assignment); - TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( - proto, xla_dump_optimized_hlo_proto_to, module->name())); - } - - // Each computation is a single function. Emit all embedded computations - // before the entry computation. The order of computations returned from - // GetEmbeddedComputations guarantees that a called computation occurs - // before a caller computation. + // Select an order for emitting the HLO instructions for each + // computation. Using this sequence enables tighter buffer liveness analysis + // and reduced memory usage (as compared to using DependencyHloOrdering). + TF_ASSIGN_OR_RETURN( + SequentialHloOrdering::HloModuleSequence module_sequence, + CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. + TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run( + module.get(), + xla::MakeUnique(module.get(), module_sequence), + BufferSizeBytesFunction(), memory_alignment)); + // BufferAssignment::ToString() includes a header, so no need for us to + // print one ourselves. + XLA_VLOG_LINES(2, assignment->ToString()); + + if (!xla_dump_optimized_hlo_proto_to.empty()) { + HloProto proto = MakeHloProto(*module, *assignment); + TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( + proto, xla_dump_optimized_hlo_proto_to, module->name())); + } - IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - jit->target_machine(), jit->external_constant_pool()); + // Each computation is a single function. Emit all embedded computations + // before the entry computation. The order of computations returned from + // GetEmbeddedComputations guarantees that a called computation occurs + // before a caller computation. - for (auto embedded_computation : - entry_computation->MakeEmbeddedComputationsList()) { - if (embedded_computation->IsFusionComputation()) { - continue; - } - TF_RETURN_IF_ERROR( - ir_emitter - .EmitComputation(embedded_computation, - embedded_computation->name(), - /*is_top_level_computation=*/false, - &module_sequence.at(embedded_computation)) - .status()); - } - string function_name_prefix = entry_computation->name().empty() - ? "__compute" - : entry_computation->name(); - TF_ASSIGN_OR_RETURN( - llvm::Function * entry_function, - ir_emitter.EmitComputation(entry_computation, function_name_prefix, - /*is_top_level_computation=*/true, - &module_sequence.at(entry_computation))); + IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + jit->target_machine(), jit->external_constant_pool()); - string function_name = llvm_ir::AsString(entry_function->getName()); - string ir_module_string; - if (embed_ir_in_executable) { - ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); + for (auto embedded_computation : + entry_computation->MakeEmbeddedComputationsList()) { + if (embedded_computation->IsFusionComputation()) { + continue; } - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + TF_RETURN_IF_ERROR( + ir_emitter + .EmitComputation(embedded_computation, embedded_computation->name(), + /*is_top_level_computation=*/false, + &module_sequence.at(embedded_computation)) + .status()); + } + string function_name_prefix = entry_computation->name().empty() + ? "__compute" + : entry_computation->name(); + TF_ASSIGN_OR_RETURN( + llvm::Function * entry_function, + ir_emitter.EmitComputation(entry_computation, function_name_prefix, + /*is_top_level_computation=*/true, + &module_sequence.at(entry_computation))); + + string function_name = llvm_ir::AsString(entry_function->getName()); + string ir_module_string; + if (embed_ir_in_executable) { + ir_module_string = llvm_ir::DumpModuleToString(*llvm_module); + } + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module)); + XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module)); - // JIT compile the LLVM IR module to in-memory machine code. - jit->AddModule(std::move(llvm_module)); - cpu_executable.reset(new CpuExecutable( - std::move(jit), std::move(assignment), std::move(module), function_name, - std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map))); + // JIT compile the LLVM IR module to in-memory machine code. + jit->AddModule(std::move(llvm_module)); + cpu_executable.reset(new CpuExecutable( + std::move(jit), std::move(assignment), std::move(module), function_name, + std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map))); - if (embed_ir_in_executable) { - static_cast(*cpu_executable) - .set_ir_module_string(ir_module_string); - } + if (embed_ir_in_executable) { + static_cast(*cpu_executable) + .set_ir_module_string(ir_module_string); } VLOG(1) << "Compilation finished"; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 97e550abe44a32f25b0aab039e95b0ce2eecde4c..aabf4d5161e3af9d49876c6133f8ec5ddfbbf6d6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -243,14 +243,14 @@ static Status DeallocateTempBuffers( return Status::OK(); } -StatusOr CpuExecutable::CreateResultShapedBuffer( +StatusOr CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result) { se::Stream* stream = run_options->stream(); - ShapedBuffer result_buffer( + ScopedShapedBuffer result_buffer( /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal()); + run_options->allocator(), stream->parent()->device_ordinal()); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer which is returned to the caller. @@ -281,7 +281,7 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( return std::move(result_buffer); } -StatusOr CpuExecutable::ExecuteOnStream( +StatusOr CpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -300,7 +300,7 @@ StatusOr CpuExecutable::ExecuteOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - ShapedBuffer result_buffer, + ScopedShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); // Free all buffers not in the result. @@ -310,7 +310,7 @@ StatusOr CpuExecutable::ExecuteOnStream( return std::move(result_buffer); } -StatusOr CpuExecutable::ExecuteAsyncOnStream( +StatusOr CpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { if (hlo_profiling_enabled()) { @@ -330,7 +330,7 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( std::vector buffers_in_result(assignment_->Allocations().size(), false); TF_ASSIGN_OR_RETURN( - ShapedBuffer result_buffer, + ScopedShapedBuffer result_buffer, CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); LogLiveAddresses(buffers, buffers_in_result); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 06b6943cb5a3f20f4d8aff8acfc4ba50b9067477..68ad38cba88720a04519fc2473fe6f9decbaaf93 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -55,12 +55,12 @@ class CpuExecutable : public Executable { std::unique_ptr hlo_profile_index_map); ~CpuExecutable() override {} - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; @@ -102,13 +102,13 @@ class CpuExecutable : public Executable { tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); - // Creates a ShapedBuffer for holding the result of the computation. The + // Creates a ScopedShapedBuffer for holding the result of the computation. The // addresses (DeviceMemoryBases) are set according to buffer assignment. // 'buffers_in_result' should point to a vector of the same size as // 'allocated_buffers'. An element in buffers_in_result is set to true if the // corresponding buffer is live out of the computation (and thus contained in // the returned ShapedBuffer). - StatusOr CreateResultShapedBuffer( + StatusOr CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice allocated_buffers, std::vector* buffers_in_result); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index 09f028463af68bbc2841fecdb2ca6c6a42498798..f9c51f243c47b8069500eca3c9c2929b17f04e62 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -19,7 +19,6 @@ limitations under the License. namespace { -const char* const kXlaParallelCpuOption = "xla_cpu_parallel"; const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; @@ -30,12 +29,6 @@ namespace xla { namespace cpu { namespace options { -bool CpuParallelBackendRequested(const HloModuleConfig& config) { - const auto& extra_options_map = - config.debug_options().xla_backend_extra_options(); - return extra_options_map.count(kXlaParallelCpuOption) > 0; -} - bool OptimizeForSizeRequested(const HloModuleConfig& config) { const auto& extra_options_map = config.debug_options().xla_backend_extra_options(); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h index 6ba0fd24538b63a3da81083482e6bee3b552dfea..be62ff3cc1af23408ca8a00f1372e7a998f160c6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h @@ -24,7 +24,6 @@ namespace xla { namespace cpu { namespace options { -bool CpuParallelBackendRequested(const HloModuleConfig& config); bool OptimizeForSizeRequested(const HloModuleConfig& config); bool VectorizedReduceDisabled(const HloModuleConfig& config); tensorflow::gtl::optional LlvmIrGemvTilingFactor( diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc deleted file mode 100644 index 662ee609232f5582ce74f4f515637b2623175e94..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.cc +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h" - -#include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" -#include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/shape_partition.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" - -namespace xla { -namespace cpu { - -StatusOr ParallelizationPreparation::Run(HloModule* module) { - XLA_VLOG_LINES(2, "ParallelizationPreparation ENTRY"); - XLA_VLOG_LINES(2, module->ToString()); - - bool changed = false; - TF_ASSIGN_OR_RETURN(changed, RunParallelTaskAssignment(module)); - - HloComputation* entry_computation = module->entry_computation(); - std::unordered_set outlined; - std::vector instructions_to_outline; - for (HloInstruction* instruction : - entry_computation->MakeInstructionPostOrder()) { - // If the instruction has been outlined, it no longer exists and we must not - // dereference it. - if (outlined.count(instruction) > 0) { - continue; - } - - // Skip parameters and constants, there is nothing to parallelize. - if (instruction->opcode() == HloOpcode::kParameter || - instruction->opcode() == HloOpcode::kConstant) { - continue; - } - - // Outline 'instruction' in isolation if it was assigned parallel tasks. - if (OutlineParallelizableInstruction(instruction)) { - outlined.insert(instruction); - changed = true; - continue; - } - - instructions_to_outline.clear(); - HloInstruction* outline_candidate = instruction; - instructions_to_outline.push_back(outline_candidate); - - // Outline sole users with the current instruction. - while (CanOutlineWithUser(outline_candidate)) { - HloInstruction* prior_candidate = outline_candidate; - outline_candidate = *outline_candidate->users().begin(); - if (std::any_of(outline_candidate->operands().begin(), - outline_candidate->operands().end(), - [&](const HloInstruction* operand) { - // Do not consider any candidates which have operands - // other than the prior candidate, constants or - // parameters. Otherwise, we'd increase the fan-in which - // would reduce parallelism. - return operand->opcode() != HloOpcode::kParameter && - operand->opcode() != HloOpcode::kConstant && - operand != prior_candidate; - })) { - break; - } - instructions_to_outline.push_back(outline_candidate); - } - - outlined.insert(instructions_to_outline.begin(), - instructions_to_outline.end()); - - // Optimization to avoid replacing a single existing kCall with another - // kCall that just calls the first one. - if (instructions_to_outline.size() == 1 && - instructions_to_outline[0]->opcode() == HloOpcode::kCall) { - continue; - } - - module->OutlineExpressionFromComputation( - instructions_to_outline, - tensorflow::strings::StrCat("pp_", instruction->name()), - entry_computation); - changed = true; - } - - XLA_VLOG_LINES(2, "ParallelizationPreparation EXIT"); - XLA_VLOG_LINES(2, module->ToString()); - return changed; -} - -StatusOr ParallelizationPreparation::RunParallelTaskAssignment( - HloModule* module) { - VLOG(1) << "RunParallelTaskAssignment max_parallelism_: " << max_parallelism_; - bool changed = false; - // Initialize ParallelTaskAssignment. - ParallelTaskAssignment parallel_task_assignment(max_parallelism_, shape_size_, - module); - // Assign parallel tasks to HLOs in entry computation. - HloComputation* computation = module->entry_computation(); - for (auto* instruction : computation->instructions()) { - // Calculate target parallel task count in [1, max_parallelism_]. - const int64 target_parallel_task_count = - parallel_task_assignment.GetTargetParallelTaskCount(instruction); - if (target_parallel_task_count == 1) { - continue; - } - - // Assign feasible dimension partitions (based on actual dimension sizes). - auto dim_partition_counts = ShapePartitionAssigner(instruction->shape()) - .Run(target_parallel_task_count); - const int64 total_partition_count = - ShapePartitionAssigner::GetTotalPartitionCount(dim_partition_counts); - if (total_partition_count <= 1) { - // Feasible partition calculation resulting in no partitioning, so skip. - continue; - } - VLOG(2) << "Assigning parallel task count: " << total_partition_count - << " to instruction: " << instruction->name(); - // Map 'instruction' to assigned dimension partitioning. - instruction->set_outer_dimension_partitions(dim_partition_counts); - } - - return changed; -} - -bool ParallelizationPreparation::OutlineParallelizableInstruction( - HloInstruction* instruction) { - if (instruction->outer_dimension_partitions().empty()) { - return false; - } - // Store dimension partition counts before outlining (which clones - // 'instruction'). - std::vector dim_partition_counts = - instruction->outer_dimension_partitions(); - // Outline 'instruction' in its own sub-computation. - HloModule* module = instruction->parent()->parent(); - auto* call = module->OutlineExpressionFromComputation( - {instruction}, tensorflow::strings::StrCat("pp_", instruction->name()), - module->entry_computation()); - // Map previously assigned 'dim_partition_counts' to cloned root instruction. - VLOG(1) << "Outlining parallelizable" - << " caller: " << call->name() - << " callee: " << call->to_apply()->root_instruction()->name(); - call->to_apply()->root_instruction()->set_outer_dimension_partitions( - dim_partition_counts); - return true; -} - -bool ParallelizationPreparation::CanOutlineWithUser( - HloInstruction* instruction) { - if (instruction->users().size() != 1) { - // Do not outline 'instruction' with multiple users. - return false; - } - if (AssignedParallelTasks(instruction) || - AssignedParallelTasks(*instruction->users().begin())) { - // Do not outline if 'instruction' (or user) were assigned parallel tasks. - return false; - } - return true; -} - -bool ParallelizationPreparation::AssignedParallelTasks( - HloInstruction* instruction) { - return !instruction->outer_dimension_partitions().empty() || - (instruction->opcode() == HloOpcode::kCall && - !instruction->to_apply() - ->root_instruction() - ->outer_dimension_partitions() - .empty()); -} - -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h b/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h deleted file mode 100644 index 87be758ef5d0535fdce3a65e54ce225042019cdb..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_ - -#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" - -namespace xla { -namespace cpu { - -// This pass prepares an HLO module for parallel execution by transforming -// subgraphs of the top-level computation into embedded computations which can -// be executed in parallel. -// TODO(b/29630486): Currently, it is limited to turning all instructions (which -// are not constants or parameters) in the entry computation into embedded -// computations. However, it could make sense to coarsen the parallelization to -// improve cache locality. Also, we will need to do something to intelligently -// handle While constructs. -class ParallelizationPreparation : public HloPassInterface { - public: - // 'max_parallelism': the maximum parallel task count per instruction. - // 'shape_size': shape size function used by HloCostAnalysis during parallel - // task assignment. - ParallelizationPreparation( - const int64 max_parallelism, - const HloCostAnalysis::ShapeSizeFunction& shape_size) - : max_parallelism_(max_parallelism), shape_size_(shape_size) {} - ~ParallelizationPreparation() override {} - - tensorflow::StringPiece name() const override { - return "cpu-parallel-prepare"; - } - - // Run parallel preparation on the given computation. Returns whether the - // computation was changed. - StatusOr Run(HloModule* module) override; - - private: - // Assigns parallel task partitions to conformant instructions in 'module'. - // Returns true on success or error status otherwise. - StatusOr RunParallelTaskAssignment(HloModule* module); - - // Outlines 'instruction' from entry computation, if it had - // been assigned parallel tasks in an earlier pass through the computation. - // Returns true if 'instruction' was successfully outlined, false otherwise. - bool OutlineParallelizableInstruction(HloInstruction* instruction); - - // Returns true if 'instruction' can be outlined into the same sub-computation - // with its single user (parallelizable instructions are not outlined with - // each other). Returns false otherwise. - bool CanOutlineWithUser(HloInstruction* instruction); - - // Returns true if 'instruction' (or the root of the sub-computation that - // 'instruction' calls) has had parallel tasks assigned in earlier pass. - // Returns false otherwise. - bool AssignedParallelTasks(HloInstruction* instruction); - - const int64 max_parallelism_; - const HloCostAnalysis::ShapeSizeFunction shape_size_; -}; - -} // namespace cpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_PARALLELIZATION_PREPARATION_H_ diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 29afd8ea5f9822ea9ae969ae035511a58de4888e..495fecc4aa8b3cf8fcb3ab63d82d8146546854da 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -1070,7 +1070,8 @@ static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, // 1) be matrices with no padding, and // 2) have an allowed element type. PrimitiveType output_primitive_type = output_shape.element_type(); - return (output_primitive_type == F32 || output_primitive_type == F16) && + return (output_primitive_type == F64 || output_primitive_type == F32 || + output_primitive_type == F16) && IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) && IsRank2WithNoPadding(output_shape); } diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 3405277d449f2d9e558f2d3f83277163655af592..0b08ad8da3cf1734c96dd41e9145aece08dfe779 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -93,8 +93,6 @@ IrEmitter::IrEmitter( computation_to_profile_idx_(std::move(computation_to_profile_idx)), alias_analysis_(hlo_module, assignment, &llvm_module->getContext()), hlo_module_config_(hlo_module.config()), - parallel_cpu_backend_( - options::CpuParallelBackendRequested(hlo_module_config_)), is_top_level_computation_(false), target_machine_features_(target_machine), external_constant_pool_(external_constant_pool) { @@ -2076,7 +2074,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*root, /*operands=*/{lhs, rhs}, - /*supported_types=*/{F16, F32})); + /*supported_types=*/{F16, F32, F64})); llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs)); llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs)); @@ -2163,8 +2161,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(call)); - if (!computation->root_instruction()->outer_dimension_partitions().empty() && - !parallel_cpu_backend_) { + if (!computation->root_instruction()->outer_dimension_partitions().empty()) { // ParallelTaskAssignment assigned partitions, emit call to // ParallelForkJoin. std::vector call_args = GetArrayFunctionCallArguments( @@ -2550,22 +2547,6 @@ Status IrEmitter::FinishVisit(HloInstruction* root) { } }; - // For the parallel cpu backend, we record the total for each embedded - // computation callee with its caller kCall HLO. - if (parallel_cpu_backend_ && is_top_level_computation_) { - auto* computation = root->parent(); - auto* entry_computation = computation->parent()->entry_computation(); - if (computation != entry_computation) { - for (HloInstruction* instruction : entry_computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCall && - instruction->to_apply()->root_instruction() == root) { - record_complete_computation(GetProfileCounterFor(*instruction)); - return Status::OK(); - } - } - } - } - // For the entry computation this increment is cumulative of embedded // computations since it includes cycles spent in computations invoked by // While, Call etc. diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 509440251497cd7337284c39dae05c5f6c28e7c2..0f2f3d1817d6e891211bed843cd05c414771f151 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -532,8 +532,6 @@ class IrEmitter : public DfsHloVisitorWithDefault { const HloModuleConfig& hlo_module_config_; - const bool parallel_cpu_backend_; - bool is_top_level_computation_; TargetMachineFeatures target_machine_features_; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc deleted file mode 100644 index a2bd4fa195bd93211914da172e7aed59b713ba8b..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ /dev/null @@ -1,528 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" -#include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" -#include "tensorflow/compiler/xla/service/cpu/shape_partition.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/logical_buffer.h" -#include "tensorflow/compiler/xla/service/shaped_buffer.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/mem.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { -namespace cpu { - -ParallelCpuExecutable::ParallelCpuExecutable( - std::unique_ptr jit, - std::unique_ptr assignment, - std::unique_ptr hlo_module, - std::unique_ptr> function_names, - std::unordered_map> - aligned_constants, - std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map) - : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map)), - jit_(std::move(jit)), - assignment_(std::move(assignment)), - function_names_(std::move(function_names)), - aligned_constants_(std::move(aligned_constants)) {} - -// Type of the computation function we expect in the JIT. -using ComputeFunctionType = void (*)(void*, const void*, const void**, void**, - int64*, int64*); - -// Given a pointer to an output buffer (following the CPU JIT calling -// conventions), mark addresses that are "live". The initial pointer itself is -// trivially live. If the shape of the buffer is a tuple, this analysis looks -// into the tuple's elements and marks them live as well (since tuples keep -// pointers to buffers) and also works recursively. -// address is an in-memory buffer address that contains some runtime XLA object. -// shape is its shape. marked_addresses is the set of live addresses to -// populate. -static void MarkLiveAddressesInOutput( - const void* address, const Shape& shape, - std::unordered_set* marked_addresses) { - marked_addresses->insert(address); - const uintptr_t* address_buffer = static_cast(address); - if (ShapeUtil::IsTuple(shape)) { - for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - const uintptr_t* element_address = address_buffer + i; - const void* element = reinterpret_cast(*element_address); - MarkLiveAddressesInOutput( - element, ShapeUtil::GetTupleElementShape(shape, i), marked_addresses); - } - } -} - -namespace { - -// Executor manages the concurrent execution of 'functions' for instructions -// in 'pending' on 'thread_pool' (storing resulting data in 'results'). -class Executor { - public: - Executor(const HloInstructionMap& functions, - const ServiceExecutableRunOptions* run_options, - std::list* pending, - HloInstructionMap* results, void** temps_array, - int64* profile_counters_array, const BufferAssignment* assignment) - : functions_(functions), - run_options_(run_options), - pending_(pending), - results_(results), - temps_array_(temps_array), - profile_counters_array_(profile_counters_array), - thread_pool_(CHECK_NOTNULL(run_options_->xla_intra_op_thread_pool())), - assignment_(assignment) {} - - // Executes pending list of instructions on thread pool. - // Returns OK status on success, error status otherwise. - Status Run(); - - private: - // Schedules a parallel invocation of compute function for 'instruction' on - // 'thread_pool_', storing result in 'result_buffer'. - // If 'partition_buffers' is non-null, parallel task will be invoked on - // per-dimension partition [start, limit) values stored in - // 'partition_buffers'. - void Schedule(HloInstruction* instruction, int64* partition_buffers, - void* result_buffer); - - // Returns true if 'instruction' has been assigned parallel tasks (returns - // false otherwise). - bool HasParallelTasks(HloInstruction* instruction); - - // Returns in 'partition_buffers' the partition [size, limit) for each - // dimension. - int64* GetPartitionBuffers( - const std::vector>& partition); - - // Returns array of result buffers for all operands in 'instruction'. - const void** GetOperandBuffers(HloInstruction* instruction); - - // Arguments passed into Executor. - const HloInstructionMap& functions_; - const ServiceExecutableRunOptions* run_options_; - std::list* pending_; - HloInstructionMap* results_; - void** temps_array_; - int64* profile_counters_array_; - tensorflow::thread::ThreadPool* thread_pool_; - const BufferAssignment* assignment_; - - // Members used to manage instruction execution. - tensorflow::mutex completion_queue_lock_; - tensorflow::condition_variable completion_queue_cv_; - std::deque completion_queue_; - int64 instructions_in_flight_ = 0; - std::unordered_map tasks_in_flight_; -}; - -Status Executor::Run() { - while (!pending_->empty() || instructions_in_flight_ > 0) { - auto pending_it = pending_->begin(); - while (pending_it != pending_->end()) { - HloInstruction* instruction = *pending_it; - // Skip pending instructions whose operands aren't ready. - if (std::any_of(instruction->operands().begin(), - instruction->operands().end(), - [&](HloInstruction* operand) { - return !ContainsKey(*results_, operand); - })) { - ++pending_it; - continue; - } - - // Get 'result_buffer' reference to result buffer for 'instruction'. - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment_->GetUniqueTopLevelSlice(instruction)); - void* result_buffer = - static_cast(temps_array_[result_slice.index()]) + - result_slice.offset(); - - if (HasParallelTasks(instruction)) { - // 'instruction' has been assigned parallel task partitions. - CHECK_EQ(HloOpcode::kCall, instruction->opcode()); - HloInstruction* root = instruction->to_apply()->root_instruction(); - - // Create ShapePartitionIterator to iterate through all outer dimension - // partitions of 'instruction'. - ShapePartitionIterator partition_iterator( - root->shape(), root->outer_dimension_partitions()); - - const int64 partition_count = - partition_iterator.GetTotalPartitionCount(); - - // Record total parallel task count for 'instruction' before dispatch. - { - tensorflow::mutex_lock l(completion_queue_lock_); - tasks_in_flight_.insert(std::make_pair(instruction, partition_count)); - VLOG(2) << "Schedule PARALLEL" - << " instruction: " << instruction->name() - << " instruction.callee: " - << instruction->to_apply()->root_instruction()->name() - << " partition_count: " << partition_count; - } - - for (int64 i = 0; i < partition_count; ++i) { - // Get partition [start, limit) for each dimension. - auto partition_buffers = - GetPartitionBuffers(partition_iterator.GetPartition(i)); - Schedule(instruction, partition_buffers, result_buffer); - } - - } else { - // Set tasks in-flight to '1' for sequential instruction execution. - { - tensorflow::mutex_lock l(completion_queue_lock_); - tasks_in_flight_.insert(std::make_pair(instruction, 1)); - VLOG(2) << "Schedule SEQUENTIAL" - << " instruction: " << instruction->name() - << " instruction.callee: " - << instruction->to_apply()->root_instruction()->name(); - } - Schedule(instruction, nullptr, result_buffer); - } - - ++instructions_in_flight_; - pending_it = pending_->erase(pending_it); - } - // Wait for a completed HLO instruction to be present in the queue. We will - // pop it out of the queue and make the result available to its users. - HloInstruction* instruction; - do { - tensorflow::mutex_lock l(completion_queue_lock_); - if (completion_queue_.empty()) { - completion_queue_cv_.wait(l); - } - if (!completion_queue_.empty()) { - instruction = completion_queue_.front(); - completion_queue_.pop_front(); - break; - } - } while (true); - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment_->GetUniqueTopLevelSlice(instruction)); - void* result_buffer = - static_cast(temps_array_[result_slice.index()]) + - result_slice.offset(); - InsertOrDie(results_, instruction, result_buffer); - --instructions_in_flight_; - } - return Status::OK(); -} - -void Executor::Schedule(HloInstruction* instruction, int64* partition_buffers, - void* result_buffer) { - // The thread pool entry takes ownership of |operand_buffers|. - auto operand_buffers = GetOperandBuffers(instruction); - - auto function = FindOrDie(functions_, instruction); - const auto* exec_run_options = &run_options_->run_options(); - thread_pool_->Schedule([this, instruction, result_buffer, operand_buffers, - partition_buffers, exec_run_options, function]() { - function(result_buffer, exec_run_options, operand_buffers, temps_array_, - partition_buffers, profile_counters_array_); - - delete[] operand_buffers; - delete[] partition_buffers; - // Push the completed HLO instruction on the queue, the main - // thread will pop it off and potentially launch more work which - // uses the result. - // TODO(b/27458679) Consider alternative task scheduling and synchronization - // schemes. For example, we could avoid the overhead associate with the - // condvar here if the thread just dequed the next instruction to execute - // on completion. - { - tensorflow::mutex_lock l(completion_queue_lock_); - // Decrement in-flight task count for this completion. - if (--FindOrDie(tasks_in_flight_, instruction) == 0) { - completion_queue_.push_back(instruction); - completion_queue_cv_.notify_all(); - tasks_in_flight_.erase(instruction); - } - } - }); -} - -int64* Executor::GetPartitionBuffers( - const std::vector>& partition) { - // Return in 'partition_buffers' partition [size, limit) for each dimension. - auto partition_buffers = new int64[partition.size() * 2]; - for (int i = 0; i < partition.size(); ++i) { - partition_buffers[2 * i + 0] = partition[i].first; - partition_buffers[2 * i + 1] = partition[i].first + partition[i].second; - } - return partition_buffers; -} - -bool Executor::HasParallelTasks(HloInstruction* instruction) { - return instruction->opcode() == HloOpcode::kCall && - !instruction->to_apply() - ->root_instruction() - ->outer_dimension_partitions() - .empty(); -} - -const void** Executor::GetOperandBuffers(HloInstruction* instruction) { - // We cannot use a move-only RAII type like std::unique_ptr because the - // list of operands is allocated on the main thread and transferred to the - // worker via the lambda passed to enqueue_function. In order for the - // lambda to take ownership, we would need to use generalized lambda - // capture which is a feature new to C++14. - // TODO(b/27458679) Avoid dynamic allocations in Executor. - auto operand_buffers = new const void*[instruction->operand_count()]; - std::transform(instruction->operands().begin(), instruction->operands().end(), - operand_buffers, [this](HloInstruction* operand) { - return FindOrDie(*results_, operand); - }); - return operand_buffers; -} - -} // namespace - -Status ParallelCpuExecutable::AllocateBuffers( - DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { - CHECK_EQ(buffers->size(), assignment_->Allocations().size()); - VLOG(3) << "Allocating " << assignment_->Allocations().size() - << " allocations for module " << module().name(); - for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); - ++i) { - auto& allocation = assignment_->GetAllocation(i); - - VLOG(3) << allocation.ToString(); - - if (allocation.is_entry_computation_parameter()) { - VLOG(3) << "allocation #" << i << " is a parameter"; - continue; - } - - if (allocation.is_thread_local()) { - VLOG(3) << "buffer #" << i << " is thread-local"; - continue; - } - - int64 buffer_size = allocation.size(); - if (!(*buffers)[i].is_null()) { - VLOG(3) << "buffer #" << i - << " is in the preallocated result ShapedBuffer"; - } else { - TF_ASSIGN_OR_RETURN((*buffers)[i], memory_allocator->Allocate( - device_ordinal, buffer_size)); - - VLOG(3) << "buffer #" << i << " allocated " << buffer_size << " bytes [" - << (*buffers)[i].opaque() << "]"; - } - - // Since the output buffer and all the temporary buffers were written into - // by the JITed code, msan has no way of knowing their memory was - // initialized. Mark them initialized so that msan doesn't flag loads from - // these buffers. - TF_ANNOTATE_MEMORY_IS_INITIALIZED((*buffers)[i].opaque(), buffer_size); - } - - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment_->GetUniqueTopLevelOutputSlice()); - VLOG(3) << "result index: " << result_slice.index(); - - return Status::OK(); -} - -Status ParallelCpuExecutable::ExecuteComputeFunctions( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice buffers, - HloExecutionProfile* hlo_execution_profile) { - // Allocate profiling counters for each hlo instruction that we would like to - // profile. - std::vector* profile_counters = nullptr; - if (hlo_execution_profile) { - profile_counters = hlo_execution_profile->mutable_profile_counters(); - } - - std::vector buffer_pointers; - buffer_pointers.reserve(buffers.size()); - for (auto device_allocation : buffers) { - buffer_pointers.push_back(device_allocation.opaque()); - } - - // Resolve functions for all the HLO instructions ahead of time. - HloInstructionMap functions; - for (auto& entry : *function_names_) { - tensorflow::mutex_lock lock(jit_mutex_); - HloInstruction* instruction = entry.first; - llvm::JITSymbol sym = jit_->FindCompiledSymbol(entry.second); - TF_RET_CHECK(sym); - InsertOrDie( - &functions, instruction, - reinterpret_cast(cantFail(sym.getAddress()))); - } - - // Map containing pointers to result buffers for each instruction. - HloInstructionMap results; - - uint64 start_micros = tensorflow::Env::Default()->NowMicros(); - - std::list pending; - - // Call the function for each HLO instruction in topological order. - const HloComputation& entry_computation = *module().entry_computation(); - for (auto* instruction : entry_computation.MakeInstructionPostOrder()) { - // Parameters and constants have no functions associated with them. Instead - // just copy the existing buffer into the map containing instruction - // results.. - if (instruction->opcode() == HloOpcode::kParameter) { - InsertOrDie( - &results, instruction, - arguments[instruction->parameter_number()]->root_buffer().opaque()); - } else if (instruction->opcode() == HloOpcode::kConstant) { - unsigned char* aligned_data = - FindOrDie(aligned_constants_, instruction).get(); - InsertOrDie(&results, instruction, aligned_data); - } else { - TF_RET_CHECK(instruction->opcode() == HloOpcode::kCall); - pending.push_back(instruction); - } - } - - // TODO(b/27458679) Manage scheduling based on in-flight concurrency limits. - // For example, if we expect a library conv/matmul call to run at max - // concurrency, we should not dispatch runnable instructions until the - // library call is finished (to avoid expensive cache invalidation). - Executor executor( - functions, run_options, &pending, &results, buffer_pointers.data(), - profile_counters ? profile_counters->data() : nullptr, assignment_.get()); - - TF_RETURN_IF_ERROR(executor.Run()); - - uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - - { - tensorflow::mutex_lock lock(mutex_); - double nanoseconds = (end_micros - start_micros) * 1000.0; - execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); - } - - return Status::OK(); -} - -StatusOr ParallelCpuExecutable::ExecuteOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - HloExecutionProfile* hlo_execution_profile) { - if (GetRootPointsToSet().IsAmbiguous()) { - return Unimplemented("Points-to set of root instruction is ambiguous"); - } - - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - - ShapedBuffer result_buffer( - /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - stream->parent()->platform(), stream->parent()->device_ordinal()); - - TF_RETURN_IF_ERROR(AllocateBuffers( - memory_allocator, stream->parent()->device_ordinal(), &buffers)); - - TF_RETURN_IF_ERROR(ExecuteComputeFunctions(run_options, arguments, buffers, - hlo_execution_profile)); - - // Copy DeviceMemoryBase values which into the respective location in - // ShapedBuffer which is returned to the caller. - std::vector buffers_in_result(assignment_->Allocations().size(), false); - TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus( - [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { - const auto& sources = this->GetRootPointsToSet().element(index); - - // The points to set is unambiguous so the set should be a singleton. - CHECK_EQ(1, sources.size()); - const LogicalBuffer* buffer_source = sources[0]; - HloInstruction* src = buffer_source->instruction(); - - // The source for this result buffer can be a nested buffer such as a - // tuple element. The source instruction should have a non-parameter - // buffer assigned. - TF_ASSIGN_OR_RETURN( - const BufferAllocation::Slice slice, - this->assignment_->GetUniqueSlice(src, buffer_source->index())); - CHECK(!slice.allocation()->is_entry_computation_parameter()); - - const BufferAllocation::Index buffer_index = slice.index(); - const se::DeviceMemoryBase& buffer = buffers[buffer_index]; - CHECK(!buffer.is_null() || buffer.size() == 0); - *device_memory = buffer; - buffers_in_result[buffer_index] = true; - return Status::OK(); - })); - - // Free all buffers not in the result. - for (size_t i = 0; i < buffers.size(); ++i) { - se::DeviceMemoryBase alloc = buffers[i]; - if (!buffers_in_result[i] && !alloc.is_null()) { - VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" - << alloc.opaque() << "]"; - TF_RETURN_IF_ERROR(memory_allocator->Deallocate( - stream->parent()->device_ordinal(), &alloc)); - } - } - - return std::move(result_buffer); -} - -StatusOr ParallelCpuExecutable::ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) { - // TODO(b/30671675): Implement asynchronous execution mode. - return Unimplemented( - "Asynchronous execution on stream is not yet supported on CPU."); -} - -const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const { - return assignment_->points_to_analysis().GetPointsToSet( - module().entry_computation()->root_instruction()); -} - -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h deleted file mode 100644 index 5ce84fa99640c782d85e8bcedd19820788737a38..0000000000000000000000000000000000000000 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/executable.h" -#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/shaped_buffer.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/stream_executor_no_cuda.h" -#include "tensorflow/core/platform/thread_annotations.h" - -namespace xla { -namespace cpu { - -// CPU-targeting parallel implementation of the XLA Executable interface. -// -// Wraps a JIT-ed object that can be executed "on device". We JIT for the host -// architecture, so JIT-ed code and host code share the same ABI. -class ParallelCpuExecutable : public Executable { - public: - ParallelCpuExecutable( - std::unique_ptr jit, - std::unique_ptr assignment, - std::unique_ptr hlo_module, - std::unique_ptr> function_names, - std::unordered_map> - aligned_constants, - std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map); - ~ParallelCpuExecutable() override {} - - StatusOr ExecuteOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - HloExecutionProfile* hlo_execution_profile) override; - - StatusOr ExecuteAsyncOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments) override; - - // This should be called after set_ir_module_string. - const string& ir_module_string() const { return ir_module_string_; } - - void set_ir_module_string(const string& ir_module_string) { - ir_module_string_ = ir_module_string; - } - - static int64 ShapeSizeBytes(const Shape& shape) { - // On the cpu, opaques are pointers. - if (ShapeUtil::IsOpaque(shape)) { - return sizeof(void*); - } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); - } - - private: - // Allocate buffers required for execution and assign them to the elements of - // "buffers". "buffers" should be sized to the number of buffers in buffer - // assignment. Each vector element corresponds to a particular Index. If - // a vector element already contains a non-null DeviceMemoryBase, then no - // buffer is assigned for this element. - Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, - int device_ordinal, - std::vector* buffers); - - // Calls the generated functions in 'function_names_', performing the - // computation with the given arguments using the supplied buffers. - Status ExecuteComputeFunctions( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice arguments, - tensorflow::gtl::ArraySlice buffers, - HloExecutionProfile* hlo_execution_profile); - - // Returns the points-to set of the root instruction of the entry - // computation. Uses points-to analysis from buffer assignment. - const PointsToSet& GetRootPointsToSet() const; - - // The JIT containing compiled modules. - tensorflow::mutex jit_mutex_; - const std::unique_ptr jit_ GUARDED_BY(jit_mutex_); - - // Buffer assignment for the buffers we need to allocate. - const std::unique_ptr assignment_; - - // The LLVM IR, in string format, of the unoptimized module generated for this - // ParallelCpuExecutable. We save a string instead of an llvm::Module* because - // leaving llvm::Module* in a singleton can cause the heap checker to emit - // false positives. - string ir_module_string_; - - // Map containing the JITted function names for each HLO instruction. - const std::unique_ptr> function_names_; - - // Map from HLO Constant instructions to a pointer to their literal data. - // The data stored in the protocol buffer might be insufficiently aligned, - // we create a sufficiently aligned copy and store it in this map. - const std::unordered_map> - aligned_constants_; - - TF_DISALLOW_COPY_AND_ASSIGN(ParallelCpuExecutable); -}; - -} // namespace cpu -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_CPU_EXECUTABLE_H_ diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index 240acf89739d1e07fab7a8e2a4b4d43a0732c57f..da45c4d45a1c56fd39b1e3e17ff131de59ceeced 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -38,13 +38,25 @@ class DeviceMemoryAllocator { virtual ~DeviceMemoryAllocator() {} // 'retry_on_failure': If false, and the first attempt to allocate the memory - // fails, the allocation should return immediately without retrying. - // An example use case is optional scratch spaces where a failure - // has only performance impact. + // fails, the allocation should return immediately without retrying. An + // example use case is optional scratch spaces where a failure has only + // performance impact. + // // Allocate() should return a null pointer for a size-0 allocation. // Deallocate() must be a no-op for null pointers. - virtual StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure = true) = 0; + virtual StatusOr Allocate(int device_ordinal, + uint64 size, + bool retry_on_failure) = 0; + + // Two-arg version of Allocate(), which sets retry-on-failure to true. + // + // (We don't simply use a default argument on the virtual Allocate function + // because default args on virtual functions are disallowed by the Google + // style guide.) + StatusOr Allocate(int device_ordinal, uint64 size) { + return Allocate(device_ordinal, size, /*retry_on_failure=*/true); + } + virtual tensorflow::Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) = 0; @@ -67,8 +79,12 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { const se::Platform* platform, tensorflow::gtl::ArraySlice stream_executors); - StatusOr Allocate( - int device_ordinal, uint64 size, bool retry_on_failure = true) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + + // Pull in two-arg overload that sets retry_on_failure to true. + using DeviceMemoryAllocator::Allocate; + tensorflow::Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 56e35e260469e383848db7bdbdbaacd9c9f9e56c..38b5efa9fb2cdbb8581682003d2fc68cf2020b2b 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -52,6 +52,13 @@ using tensorflow::strings::StrCat; namespace { +int64 GlobalRandomValue() { + static auto* mu = new tensorflow::mutex(); + static std::mt19937_64 rng{42}; + tensorflow::mutex_lock l(*mu); + return rng(); +} + llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits, int64 mantissa_bits, llvm::IRBuilder<>* ir_builder) { @@ -1175,7 +1182,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( llvm::Value* increment = ir_builder_->getInt( llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D})); - auto random_value = [hlo]() { + auto random_value_from_hlo = [hlo]() { const HloModule* module = hlo->IsFused() ? hlo->parent()->FusionInstruction()->parent()->parent() : hlo->parent()->parent(); @@ -1197,10 +1204,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( /*Ty=*/ir_builder_->getInt64Ty(), /*isConstant=*/false, /*Linkage=*/llvm::GlobalValue::PrivateLinkage, - /*Initializer=*/ir_builder_->getInt64(random_value()), + /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()), /*Name=*/"state_ptr0"); + + // When the module config seed is 0, the expected result of a prng is a random + // value. Instead of using the random_value_from_hlo, we need a global random + // value as the graph seed. This is because if we use random_value_from_hlo + // here, then for a newly built hlo graph, it always gives the same number. uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed() - : random_value(); + : GlobalRandomValue(); llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable( /*M=*/*module_, /*Ty=*/ir_builder_->getInt64Ty(), diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index b097ef79cc6d925068c55ba65c598706408648f5..021f09d310b718b51932d0492d1b8f5eb562605c 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -29,12 +29,12 @@ using tensorflow::gtl::ArraySlice; namespace xla { -StatusOr> Executable::ExecuteOnStreams( +StatusOr> Executable::ExecuteOnStreams( ArraySlice run_options, ArraySlice> arguments) { TF_RET_CHECK(run_options.size() == arguments.size()); - std::vector return_values; + std::vector return_values; return_values.reserve(run_options.size()); if (run_options.size() == 1) { @@ -60,7 +60,7 @@ StatusOr> Executable::ExecuteOnStreams( return std::move(return_values); } -StatusOr Executable::ExecuteOnStreamWrapper( +StatusOr Executable::ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, ArraySlice arguments) { se::Stream* stream = run_options->stream(); @@ -80,7 +80,7 @@ StatusOr Executable::ExecuteOnStreamWrapper( &hlo_profile_index_map()) : nullptr; - StatusOr return_value = + StatusOr return_value = ExecuteOnStream(run_options, arguments, profile_ptr.get()); TF_RETURN_IF_ERROR(return_value.status()); @@ -163,4 +163,24 @@ Status Executable::DumpSessionModule() { result); } +/* static */ Status Executable::DumpToDirectory( + const string& directory_path, string filename, + const HloSnapshot& hlo_session) { + tensorflow::Env* env = tensorflow::Env::Default(); + if (!env->IsDirectory(directory_path).ok()) { + // NB! CreateDir does not work reliably with multiple XLA threads -- two + // threads can race to observe the absence of the dump directory and + // simultaneously try to create it, causing the "losing" thread to get a + // "directory already exists" error. + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path)); + } + filename = SanitizeFileName(std::move(filename)); + string file_path = tensorflow::io::JoinPath(directory_path, filename); + string result; + TF_RET_CHECK( + tensorflow::SerializeToStringDeterministic(hlo_session, &result)); + return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path, + result); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 9c725f21d80ca02582a54061878c99b2612d8a44..f7af1ca57492972c58d3ce5ddc083088a32a968f 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -62,14 +63,14 @@ class Executable { // enabled. // // Returns a shaped buffer containing the result of the computation. - virtual StatusOr ExecuteOnStream( + virtual StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) = 0; // Same as ExecuteOnStream(), but this call is non-blocking and returns as // soon as all of the operations are enqueued for launch on the stream. - virtual StatusOr ExecuteAsyncOnStream( + virtual StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) = 0; @@ -77,7 +78,7 @@ class Executable { // streams. arguments[i] contains the arguments to the execution on // run_options[i]->stream() and the returned value is at index i of the // returned vector. - virtual StatusOr> ExecuteOnStreams( + virtual StatusOr> ExecuteOnStreams( tensorflow::gtl::ArraySlice run_options, tensorflow::gtl::ArraySlice< @@ -97,7 +98,7 @@ class Executable { // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a // timer for the execution, sets up HLO profiling if enabled, and fills in the // given ExecutionProfile if non-null. - StatusOr ExecuteOnStreamWrapper( + StatusOr ExecuteOnStreamWrapper( const ServiceExecutableRunOptions* run_options, ExecutionProfile* profile, tensorflow::gtl::ArraySlice arguments); @@ -155,6 +156,10 @@ class Executable { static Status DumpToDirectory(const string& directory_path, string filename, const SessionModule& session_module); + // Dump hlo snapshot to directory_path/filename. + static Status DumpToDirectory(const string& directory_path, string filename, + const HloSnapshot& hlo_session); + protected: mutable tensorflow::mutex mutex_; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 62ce15bc59d34e3bd1768313f2894871bf97ef21..980cc89fa03abd874a8e0a694f2abb775c1de050 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -250,7 +250,7 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } -StatusOr GpuExecutable::ExecuteOnStream( +StatusOr GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -297,8 +297,8 @@ StatusOr GpuExecutable::ExecuteOnStream( HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); auto device_ordinal = executor->device_ordinal(); - auto shaped_buffer = ShapedBuffer(root->shape(), root->shape(), - executor->platform(), device_ordinal); + ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(), + memory_allocator, device_ordinal); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer. @@ -335,7 +335,7 @@ StatusOr GpuExecutable::ExecuteOnStream( return std::move(shaped_buffer); } -StatusOr GpuExecutable::ExecuteAsyncOnStream( +StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { // TODO(b/30671675): Implement asynchronous execution mode. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 361bc30b2f367c23051e06a1169f9c15c19f4a6a..80ec38c3ac114fe4ad9d56784330c1144d913db1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -74,12 +74,12 @@ class GpuExecutable : public Executable { // ExecuteOnStream will fail if the compute capability of the stream doesn't // match the compute capability passed to this object's constructor. - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index 0c3eb7dcb44f1203ae10e238f194e41e9ecaea74..aa6860880b7a1308d3ecabb52318daa7d2852af2 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -300,7 +300,7 @@ message HloProto { // Encapsulates HloProto together with the arguments, result, and // execution_platform. This message is used for purposes such as // analysis/replay/file-storage. -message HloSession { +message HloSnapshot { // The hlo graph. HloProto hlo = 1; diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index cd7cbbdd71706fddb64855f631eb09de35da52e8..3b22c93733af293e4d73a2b1b3ac8822dec6d5f5 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -97,6 +97,10 @@ StatusOr HloCSE::Run(HloModule* module) { const std::function eq_computations = std::equal_to(); for (auto* computation : module->computations()) { + if (only_fusion_computations_ && !computation->IsFusionComputation()) { + continue; + } + changed |= CombineConstants(computation, is_layout_sensitive_); std::list post_order = diff --git a/tensorflow/compiler/xla/service/hlo_cse.h b/tensorflow/compiler/xla/service/hlo_cse.h index 70096e07a2493763a9d4b0dc8e1c31510718c6c2..5e2b348bdda2b31556fb692e24d2bad2e4173ef5 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.h +++ b/tensorflow/compiler/xla/service/hlo_cse.h @@ -29,9 +29,11 @@ class HloCSE : public HloPassInterface { public: // If is_layout_sensitive is true, then the simplifier preserves layout during // transformation. Otherwise, layout is ignored. - explicit HloCSE(bool is_layout_sensitive) - : is_layout_sensitive_(is_layout_sensitive) {} - ~HloCSE() override {} + explicit HloCSE(bool is_layout_sensitive, + bool only_fusion_computations = false) + : is_layout_sensitive_(is_layout_sensitive), + only_fusion_computations_(only_fusion_computations) {} + ~HloCSE() override = default; tensorflow::StringPiece name() const override { return "cse"; } // Run CSE on the given module. Returns whether the module was changed (common @@ -39,7 +41,8 @@ class HloCSE : public HloPassInterface { StatusOr Run(HloModule* module) override; private: - bool is_layout_sensitive_; + const bool is_layout_sensitive_; + const bool only_fusion_computations_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index a638d54d85214b1c34d5b72c11df91b5b0218154..a714d0e114245021c28da26beae444dbd3d99bb5 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -2451,12 +2451,6 @@ HloInstructionProto HloInstruction::ToProto() const { proto.add_fft_length(fft_len); } - if (gather_dimension_numbers_ != nullptr) { - *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_; - } - for (int64 bound : gather_window_bounds_) { - proto.add_gather_window_bounds(bound); - } proto.set_channel_name(channel_name_); proto.set_cost_estimate_ns(cost_estimate_ns_); diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index bc74c4bc10cad20eab20b5caf8550b17048a5276..69deac263ee58f9e4d46987a54f09b11d650950a 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -132,6 +132,69 @@ bool HloCustomCallMatcher::MatchAndExplain( return result; } +bool HloShapeMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (ShapeUtil::Compatible(instruction->shape(), shape_)) { + return true; + } + *listener << instruction->ToString() << " has incorrect shape (expected: " + << ShapeUtil::HumanString(shape_) << ")"; + return false; +} + +void HloShapeMatcher::DescribeTo(std::ostream* os) const { + *os << ShapeUtil::HumanString(shape_); +} + +bool HloShapeAndLayoutMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (ShapeUtil::Equal(instruction->shape(), shape_)) { + return true; + } + *listener << instruction->ToString() << " has incorrect shape (expected: " + << ShapeUtil::HumanStringWithLayout(shape_) << ")"; + return false; +} + +void HloShapeAndLayoutMatcher::DescribeTo(std::ostream* os) const { + *os << ShapeUtil::HumanStringWithLayout(shape_); +} + +bool HloShardingMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (!sharding_.has_value()) { + if (!instruction->has_sharding()) { + return true; + } + *listener << instruction->ToString() << " expected to have no sharding."; + return false; + } + if (instruction->has_sharding()) { + if (instruction->sharding() == sharding_.value()) { + return true; + } + *listener << instruction->ToString() + << " has incorrect sharding (expected: " << sharding_->ToString() + << ")"; + return false; + } else { + *listener << instruction->ToString() + << " has no sharding (expected: " << sharding_->ToString() << ")"; + return false; + } +} + +void HloShardingMatcher::DescribeTo(std::ostream* os) const { + if (sharding_.has_value()) { + *os << sharding_->ToString(); + } else { + *os << ""; + } +} + } // namespace testing void PrintTo(const HloInstruction* inst, ::std::ostream* os) { diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index 103f04a2cb7a1a5ae877d8bf259692f7cbed3408..f2ab9b5d9b6e000cdd808fe06a70f0506ddb5ff3 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/lib/gtl/optional.h" namespace xla { namespace testing { @@ -86,6 +87,50 @@ class HloCustomCallMatcher : public HloMatcher { ::testing::Matcher call_target_matcher_; }; +class HloShapeMatcher + : public ::testing::MatcherInterface { + public: + explicit HloShapeMatcher(const Shape& shape) : shape_(shape) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + Shape shape_; +}; + +class HloShapeAndLayoutMatcher + : public ::testing::MatcherInterface { + public: + explicit HloShapeAndLayoutMatcher(const Shape& shape) : shape_(shape) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + Shape shape_; +}; + +// Verify the sharding of an instruction against the provided HloSharding. If a +// nullopt is provided for the expected sharding then it checks that no sharding +// is present for an instruction. +class HloShardingMatcher + : public ::testing::MatcherInterface { + public: + explicit HloShardingMatcher( + const tensorflow::gtl::optional& sharding) + : sharding_(sharding) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + tensorflow::gtl::optional sharding_; +}; + // HloInstruction* matchers for opcode and operands. Example: // namespace op = xla::opcode_matchers; // EXPECT_THAT(instruction, @@ -231,6 +276,30 @@ inline ::testing::Matcher CustomCall() { new ::xla::testing::HloMatcher(HloOpcode::kCustomCall, {})); } +// Verifies the shape or the shape and the layout of an HLO instruction against +// the provided shape object. +inline ::testing::Matcher Shape( + const class Shape& shape) { + return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape)); +} +inline ::testing::Matcher ShapeWithLayout( + const class Shape& shape) { + return ::testing::MakeMatcher( + new ::xla::testing::HloShapeAndLayoutMatcher(shape)); +} + +// Verifies the value of the HloSharing against the provided sharding object. +inline ::testing::Matcher Sharding( + const HloSharding& sharding) { + return ::testing::MakeMatcher( + new ::xla::testing::HloShardingMatcher(sharding)); +} +// Verifies that no HloSharding is set for an HLO instruction. +inline ::testing::Matcher NoSharding() { + return ::testing::MakeMatcher( + new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt)); +} + #undef HLO_MATCHER } // namespace opcode_matchers diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index 1c21703a45e11914854153bc14fabd85e9ea57f2..c6373b2e46af7df1c7cfe03f95e155f519f39a11 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -100,5 +100,63 @@ TEST(HloMatchersTest, CustomCallMatcher) { R"(custom-call with call target that is equal to "foo_target")"); } +TEST(HloMatchersTest, ShapeMatcher) { + auto p0 = HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}), "param"); + + EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {5, 7}))); + EXPECT_THAT( + p0.get(), + ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {5, 7})))); + EXPECT_THAT(p0.get(), + ::testing::Not(op::Shape(ShapeUtil::MakeShape(F32, {7, 5})))); + EXPECT_THAT( + p0.get(), + ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {7, 5})))); + EXPECT_THAT(p0.get(), + op::Shape(ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}))); + EXPECT_THAT(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout( + F32, {5, 7}, {0, 1}))); + EXPECT_THAT(p0.get(), + ::testing::Not(op::ShapeWithLayout( + ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {1, 0})))); + + EXPECT_THAT(Explain(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))), + "%param = f32[5,7]{0,1} parameter(0) has incorrect shape " + "(expected: f32[7,5])"); + EXPECT_THAT( + Explain(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout( + F32, {7, 5}, {1, 0}))), + "%param = f32[5,7]{0,1} parameter(0) has incorrect shape " + "(expected: f32[7,5]{1,0})"); +} + +TEST(HloMatchersTest, ShardingMatcher) { + auto p0 = HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {5}), + "param.0"); + p0->clear_sharding(); + auto p1 = HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {7}), + "param.1"); + p1->set_sharding(HloSharding::AssignDevice(1)); + + EXPECT_THAT(p0.get(), op::NoSharding()); + EXPECT_THAT(p0.get(), + ::testing::Not(op::Sharding(HloSharding::AssignDevice(1)))); + EXPECT_THAT(p1.get(), ::testing::Not(op::NoSharding())); + EXPECT_THAT(p1.get(), + ::testing::Not(op::Sharding(HloSharding::AssignDevice(0)))); + EXPECT_THAT(p1.get(), op::Sharding(HloSharding::AssignDevice(1))); + + EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))), + "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: " + "{maximal device=1})"); + EXPECT_THAT(Explain(p1.get(), op::NoSharding()), + "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} " + "expected to have no sharding."); + EXPECT_THAT(Explain(p1.get(), op::Sharding(HloSharding::AssignDevice(0))), + "%param.1 = f32[7]{0} parameter(1), sharding={maximal device=1} " + "has incorrect sharding (expected: {maximal device=0})"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index df5ffd0b7d6f0f4575c5f3d8aaa7bff4a1b1e771..81c43db292a75dfd1e882537052b75d64a55002e 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -126,16 +126,12 @@ StatusOr> HloRunner::Execute( } TF_ASSIGN_OR_RETURN( - ShapedBuffer result, + ScopedShapedBuffer result, executable->ExecuteOnStreamWrapper( &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs)); - // Create a ScopedShapedBuffer of the result to manage deallocation. This will - // deallocate all the device memory when it goes out of scope. - ScopedShapedBuffer scoped_result(std::move(result), run_options.allocator()); - auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice( - stream.parent(), scoped_result); + stream.parent(), result); if (result_literal.ok()) { VLOG(4) << "Executed binary and got result: " << result_literal.ValueOrDie()->ToString(); @@ -248,18 +244,16 @@ StatusOr>> HloRunner::ExecuteReplicated( } LOG(INFO) << "Replicated execution started"; - TF_ASSIGN_OR_RETURN(std::vector results, + TF_ASSIGN_OR_RETURN(std::vector results, executable->ExecuteOnStreams(service_run_options, argument_buffer_slices)); LOG(INFO) << "Replicated execution terminated"; std::vector> exec_results; for (int64 i = 0; i < options.num_replicas; ++i) { - ScopedShapedBuffer result(std::move(results[i]), - backend().memory_allocator()); TF_ASSIGN_OR_RETURN(std::unique_ptr literal, backend().transfer_manager()->TransferLiteralFromDevice( - streams[i]->parent(), result)); + streams[i]->parent(), results[i])); exec_results.push_back(std::move(literal)); } return std::move(exec_results); diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 1b42349b0b3ad9634bb910b3843affed6a0ca334..994de441237493b5e2254a0a66763d6195c5ea85 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -256,37 +256,24 @@ Status HloSharding::ValidateNonTuple(const Shape& shape, ", input_shape=", ShapeUtil::HumanString(shape)); } - // The tile shape must not be the same as the input shape without maximal_ - // also set. If this is the case, we're not actually sharded and the correct - // constructor should have been used. - if (ShapeUtil::Equal(shape, tile_shape_)) { + // The correct constructor have to be used to create tile maximal shardings. + if (tile_assignment_.num_elements() == 1) { return tensorflow::errors::InvalidArgument( - "Tile shape is the same as the input shape. If a replicated sharding " - "was intended, use HloSharding::Replicated(). If a device placement " - "was intended, use HloSharding::AssignDevice()"); + "Tile assignment only contains a single device. If a replicated " + "sharding was intended, use HloSharding::Replicated(). If a device " + "placement was intended, use HloSharding::AssignDevice()"); } - // The tile shape must not be greater than the input shape in any dimension. - for (int64 i = 0, e = ShapeUtil::Rank(shape); i != e; ++i) { - auto tile_dim = tile_shape_.dimensions(i); - auto shape_dim = shape.dimensions(i); - if (tile_dim > shape_dim) { - return tensorflow::errors::InvalidArgument( - StrCat("Tile is larger than input shape (dimension ", i, ", ", - tile_dim, " > ", shape_dim)); - } - } - - // The tile assignment tensor must be exactly dimensioned to ceil(shape[dim] - // tile[dim]) for every dimension contained within tile. + // The tile assignment tensor must contain enough element to cover the full + // shape with tiles of the specified size. for (int64 i = 0, e = tile_assignment_.dimensions().size(); i != e; ++i) { - int64 expected_dim = - CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i)); - if (tile_assignment_.dimensions()[i] != expected_dim) { + int64 total_tile_size = tile_assignment_.dim(i) * tile_shape_.dimensions(i); + if (shape.dimensions(i) > total_tile_size) { return tensorflow::errors::InvalidArgument( - StrCat("Tile assignment tensor has incorrect shape. Dimension ", i, - " expected ", expected_dim, " but got ", - tile_assignment_.dimensions()[i])); + StrCat("Tile assignment tensor has too few element to cover the full " + "shape. Dimension ", + i, ", shape ", shape.dimensions(i), ", total size ", + total_tile_size)); } } diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc index 69ea4233e45c2e59c8d1541a0517a007f4bbf42f..3bf0d25efb7fad78aeccdd9269c289950b2171ab 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc @@ -88,7 +88,7 @@ TEST_F(HloShardingTest, Tile) { } { - // Test should pass. + // Test should fail because of more devices used then `num_device`. Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); HloSharding sharding = HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); @@ -97,17 +97,8 @@ TEST_F(HloShardingTest, Tile) { } { - // Test should fail due to the tile being larger than the input space. - Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); - HloSharding sharding = - HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); - EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(F32, {2, 2}), - /*num_devices=*/4)); - } - - { - // Test should fail due to the tile not dividing the input space into 4 - // sections (even with padding). + // Test should fail because the total tiled size in dimension 0 is 4 but we + // have 6 elements along that dimensions. Shape tile_shape = ShapeUtil::MakeShape(U32, {2, 3}); HloSharding sharding = HloSharding::Tile(tile_shape, MakeArray({2, 2}, {0, 1, 2, 3})); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 8c875698eb1992719d504d272ca338b05b60e36b..8a30cbf9cd622ffb64d345ddaf0dc88f34850bfc 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/errors.h" @@ -731,6 +732,73 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { return tensorflow::Status::OK(); } +Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { + auto* while_cond = instruction->while_condition(); + auto* while_body = instruction->while_body(); + if (while_cond->num_parameters() != 1) { + return FailedPrecondition( + "While condition must have exactly 1 parameter; had %lld : %s", + while_cond->num_parameters(), while_cond->ToString().c_str()); + } + if (while_body->num_parameters() != 1) { + return FailedPrecondition( + "While body must have exactly 1 parameter; had %lld : %s", + while_body->num_parameters(), while_body->ToString().c_str()); + } + if (instruction->operand_count() != 1) { + return FailedPrecondition( + "While loop must have exactly one operand; had %lld : %s", + instruction->operand_count(), instruction->ToString().c_str()); + } + auto* init = instruction->operand(0); + auto* cond_param = while_cond->parameter_instruction(0); + if (!ShapeUtil::Compatible(init->shape(), cond_param->shape())) { + return FailedPrecondition( + "While condition's parameter must have the same shape as the " + "loop's 'init'. init: %s, param: %s", + init->ToString().c_str(), cond_param->ToString().c_str()); + } + auto* cond_root = while_cond->root_instruction(); + if (!ShapeUtil::Compatible(cond_root->shape(), + ShapeUtil::MakeShape(PRED, {}))) { + return FailedPrecondition("While condition should have shape PRED: %s", + cond_root->ToString().c_str()); + } + auto* body_param = while_body->parameter_instruction(0); + if (!ShapeUtil::Compatible(init->shape(), body_param->shape())) { + return FailedPrecondition( + "While body's parameter must have the same shape as the loop's" + " 'init'. init: %s, param: %s", + init->ToString().c_str(), body_param->ToString().c_str()); + } + auto* body_root = while_body->root_instruction(); + if (!ShapeUtil::Compatible(init->shape(), body_root->shape())) { + return FailedPrecondition( + "While body should have same shape as the loop's 'init'." + "init: %s, body: %s", + init->ToString().c_str(), body_root->ToString().c_str()); + } + return tensorflow::Status::OK(); +} + +Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { + const Shape& out_shape = instruction->shape(); + for (HloInstruction* operand : instruction->operands()) { + const Shape& operand_shape = operand->shape(); + if (!ShapeUtil::IsScalar(operand_shape) && + !ShapeUtil::CompatibleIgnoringElementType(operand_shape, out_shape)) { + return FailedPrecondition( + "Implicit broadcast is not allowed in HLO." + "Found non-compatible shapes for instruction %s.\n" + "output: %s\noperand: %s\n", + HloOpcodeString(instruction->opcode()).c_str(), + ShapeUtil::HumanString(out_shape).c_str(), + ShapeUtil::HumanString(operand_shape).c_str()); + } + } + return tensorflow::Status::OK(); +} + StatusOr HloVerifier::Run(HloModule* module) { TF_RETURN_IF_ERROR(VerifyHloStructure(module)); @@ -771,39 +839,9 @@ StatusOr HloVerifier::Run(HloModule* module) { << instruction->dimensions().size() << " != " << ShapeUtil::Rank(instruction->operand(0)->shape()); } else if (instruction->opcode() == HloOpcode::kWhile) { - auto* while_cond = instruction->while_condition(); - auto* while_body = instruction->while_body(); - TF_RET_CHECK(while_cond->num_parameters() == 1) - << "While condition must have exactly 1 parameter; had " - << while_cond->num_parameters() << ": " << while_cond->ToString(); - TF_RET_CHECK(while_body->num_parameters() == 1) - << "While body must have exactly 1 parameter; had " - << while_body->num_parameters() << ": " << while_body->ToString(); - TF_RET_CHECK(instruction->operand_count() == 1) - << "While loop must have exactly one operand; had " - << instruction->operand_count() << ": " << instruction->ToString(); - - auto* init = instruction->operand(0); - auto* cond_param = while_cond->parameter_instruction(0); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), cond_param->shape())) - << "While condition's parameter must have the same shape as the " - "loop's 'init'. init: " - << init->ToString() << ", param: " << cond_param->ToString(); - auto* cond_root = while_cond->root_instruction(); - TF_RET_CHECK(ShapeUtil::Compatible(cond_root->shape(), - ShapeUtil::MakeShape(PRED, {}))) - << "While condition should have shape PRED: " - << cond_root->ToString(); - - auto* body_param = while_body->parameter_instruction(0); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_param->shape())) - << "While body's parameter must have the same shape as the loop's " - "'init'. init: " - << init->ToString() << ", param: " << body_param->ToString(); - auto* body_root = while_body->root_instruction(); - TF_RET_CHECK(ShapeUtil::Compatible(init->shape(), body_root->shape())) - << "While body should have same shape as the loop's 'init'. init: " - << init->ToString() << ", body: " << body_root->ToString(); + TF_RETURN_IF_ERROR(CheckWhileInstruction(instruction)); + } else if (instruction->IsElementwise()) { + TF_RETURN_IF_ERROR(CheckElementwiseInstruction(instruction)); } auto previous = instructions.find(instruction->name()); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 1dd7ec3c51e18dcfe89bd478de87798ba3858119..6208887547a14d22b512ef308dd2668af2f4468d 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -102,7 +102,7 @@ class ShapeVerifier : public DfsHloVisitor { Status CheckTernaryShape(const HloInstruction* instruction); Status CheckVariadicShape(const HloInstruction* instruction); - // Checks if the given two instructions shares the same channel id. + // Checks if the given two instructions share the same channel id. Status CheckSameChannel(const HloInstruction* instr1, const HloInstruction* instr2); @@ -144,9 +144,15 @@ class HloVerifier : public HloPassInterface { // CHECKs various invariants of a fusion instruction. Status CheckFusionInstruction(HloInstruction* fusion) const; + Status CheckWhileInstruction(HloInstruction* instruction); + + // Checks that the non-scalar operand shapes are compatible to the output + // shape, i.e., that there are no implicit broadcasts of size-one dimensions. + Status CheckElementwiseInstruction(HloInstruction* instruction); + // Creates a ShapeVerifier that checks that shapes match inferred - // expectations. This is a factory function because ShapeVerifier, Note that - // ShapeVerifier, being a DfsHloVisitor, is stateful. We want a clean object + // expectations. This is a factory function because ShapeVerifier, + // being a DfsHloVisitor, is stateful. We want a clean object // for each run of the verifier. ShapeVerifierFactory shape_verifier_factory_; }; diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 6553000336b072d56ea01cad1f2f66c7bd339e1a..61f199bc9e8f4f95a2f097af4abf9395a1e05f64 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -45,7 +45,7 @@ InterpreterExecutable::InterpreterExecutable( InterpreterExecutable::~InterpreterExecutable() {} -StatusOr InterpreterExecutable::ExecuteOnStream( +StatusOr InterpreterExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) { @@ -88,8 +88,8 @@ StatusOr InterpreterExecutable::ExecuteOnStream( evaluator.Evaluate>(*computation, arg_literals)); // Transform the result literal back into a ShapedBuffer. - TF_ASSIGN_OR_RETURN(ShapedBuffer result, - transfer_manager->AllocateShapedBuffer( + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, + transfer_manager->AllocateScopedShapedBuffer( result_literal->shape(), run_options->allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice( @@ -106,7 +106,7 @@ StatusOr InterpreterExecutable::ExecuteOnStream( return std::move(result); } -StatusOr InterpreterExecutable::ExecuteAsyncOnStream( +StatusOr InterpreterExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) { return tensorflow::errors::Unimplemented( diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index c825a9a368d43b5ce1cdc4701e3bbf57374d81f3..b0b797ca7d6f449a11c662ffba7c2a0a0040e47e 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -43,12 +43,12 @@ class InterpreterExecutable : public Executable { InterpreterExecutable(std::unique_ptr hlo_module); ~InterpreterExecutable() override; - StatusOr ExecuteOnStream( + StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, HloExecutionProfile* hlo_execution_profile) override; - StatusOr ExecuteAsyncOnStream( + StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments) override; diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index ce2f4d378c02592da01be12c0647da47d00b01ba..92e069a8c67c1d441ba9d396dee503c9b3bde0df 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -71,8 +71,8 @@ port::StatusOr XlaInterpreterPlatform::GetExecutor( port::StatusOr> XlaInterpreterPlatform::GetUncachedExecutor( const StreamExecutorConfig& config) { - auto executor = port::MakeUnique( - this, port::MakeUnique(config.plugin_config)); + auto executor = MakeUnique( + this, MakeUnique(config.plugin_config)); auto init_status = executor->Init(config.ordinal, config.device_options); if (!init_status.ok()) { return port::Status{ diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc index 094f7319f462a71f4bfe972771a1de4aedbb8ee3..13e2d3258e3b92f52320201c382594962c0e3b2b 100644 --- a/tensorflow/compiler/xla/service/reshape_mover_test.cc +++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc @@ -458,57 +458,6 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) { EXPECT_EQ(select, computation->root_instruction()); } -// Tree looks like: -// -// param0 [1,128,1] -// | -// reshape [128,1] constant [128,1024] -// \ / -// multiply w/implicit broadcast [128,1024] -// -// The reshape mover would like to sink the reshape below the multiply. -// -// Previously we would attempt to insert a reshape of the constant to [1,128,1] -// (which is unsound, because it has a different number of elements) as -// preparation for sinking the reshape. -// -// To eliminate the unsoundness, we outlaw reshape sinking when one of the -// operands is implicitly broadcast in the elementwise consumer. -// -// TODO(b/37799338) However, it would be possible in this case to do a more -// in-depth analysis to get reshape movement to occur: -// -// 1. Note that the broadcast dimension (logical dimension 1) in the operands -// would map back to logical dimension 2 in the param0 node. -// 2. Match rank of the constant to the param0 node (by prepending a trivial 1 -// dimension). -// 3. Reshape to [128,1024] at the root. -// -// But this is not currently done. -TEST_F(ReshapeMoverTest, ImplicitlyBroadcastReshapeIsNotMovedBug37787999) { - HloComputation::Builder builder(TestName()); - auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 128, 1}), "param0")); - auto reshape = builder.AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(F32, {128, 1}), param0)); - Array2D a(128, 1024); - auto literal = Literal::CreateR2FromArray2D(a); - auto constant = builder.AddInstruction( - HloInstruction::CreateConstant(std::move(literal))); - auto multiply = builder.AddInstruction(HloInstruction::CreateBinary( - constant->shape(), HloOpcode::kMultiply, constant, reshape)); - - auto computation = module().AddEntryComputation(builder.Build()); - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Constant(), op::Reshape(param0))); - - EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), - op::Multiply(op::Constant(), op::Reshape(param0))); - EXPECT_EQ(multiply, computation->root_instruction()); -} - // Tree looks like this: // // add1 diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 39f3aefdf8045be465176ca2b6318c4bcffdb0d7..086bd61dd04aa13b6bba7898806951fedc6fbfd4 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -550,7 +550,7 @@ Service::ExecuteParallelAndRegisterResult( // Stream executors for the replicas of the current computation. TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); CHECK_EQ(replicas.size(), arguments[i].size()); - std::vector result_buffers; + std::vector result_buffers; for (int64 replica = 0; replica < replicas.size(); ++replica) { TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, backend->BorrowStream(replicas[replica])); @@ -582,7 +582,7 @@ Service::ExecuteParallelAndRegisterResult( backend->StreamBorrower()); // Asynchronously launch the computation. - TF_ASSIGN_OR_RETURN(ShapedBuffer result, + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, executables[i]->ExecuteAsyncOnStream( &run_options, arguments[i][replica])); @@ -1234,7 +1234,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, streams.push_back(std::move(stream)); } - std::vector result_buffers; + std::vector result_buffers; for (size_t i = 0; i < streams.size(); ++i) { const auto& stream = streams[i]; ExecutableRunOptions options; @@ -1247,7 +1247,7 @@ tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, ServiceExecutableRunOptions service_options( options, execute_backend_->StreamBorrower()); - TF_ASSIGN_OR_RETURN(ShapedBuffer this_result_buffer, + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer, executable->ExecuteAsyncOnStream( &service_options, replicated_arguments[i])); @@ -1347,11 +1347,11 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg, } // Allocate memory in each replica and transfer the data to all replicas. - std::vector replicated_buffers; + std::vector replicated_buffers; for (se::StreamExecutor* executor : replicas) { TF_ASSIGN_OR_RETURN( - ShapedBuffer shaped_buffer, - execute_backend_->transfer_manager()->AllocateShapedBuffer( + ScopedShapedBuffer shaped_buffer, + execute_backend_->transfer_manager()->AllocateScopedShapedBuffer( shape, execute_backend_->memory_allocator(), executor->device_ordinal())); TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 0b5a383f6fe5b01615c8d24ccf00fb667e0fe0c2..fb3b5f06dad67b4305aed0305c9f6441e666db53 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -117,7 +117,7 @@ ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer, : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {} ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s) - : ShapedBuffer(std::move(s)), allocator_(s.allocator_) { + : ShapedBuffer(static_cast(s)), allocator_(s.allocator_) { // Null out s.allocator_ so it doesn't try to free anything in its destructor. s.allocator_ = nullptr; } @@ -151,7 +151,7 @@ ScopedShapedBuffer::~ScopedShapedBuffer() { } ShapedBuffer ScopedShapedBuffer::release() { - ShapedBuffer shaped_buffer(std::move(*this)); + ShapedBuffer shaped_buffer(static_cast(*this)); buffers_ = ShapeTree(); return shaped_buffer; } diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index f1b0527474cfe28fa7108fda829556d24c12b4d9..e10fca9e9466c018f6cb4da2f5618e4db4977307 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -30,6 +30,8 @@ limitations under the License. namespace xla { +class ScopedShapedBuffer; + // Class which encapsulates a buffer or set of buffers containing data of a // particular XLA shape. class ShapedBuffer { @@ -49,6 +51,10 @@ class ShapedBuffer { ShapedBuffer(const ShapedBuffer&) = delete; ShapedBuffer& operator=(const ShapedBuffer&) = delete; + // Prevent (some forms of) accidental object slicing. + ShapedBuffer(const ScopedShapedBuffer&) = delete; + ShapedBuffer& operator=(const ScopedShapedBuffer&) = delete; + virtual ~ShapedBuffer(); // Returns the shape of the on-host representation of the data held by this diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 98d0111d04d981dd3e26733e3311fd846b55d8ae..8b71a415091f028b3167cddb2583754e72ba17c8 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -175,7 +175,7 @@ Status TransferManager::TransferBufferToDevice( return Status::OK(); } -StatusOr TransferManager::AllocateShapedBuffer( +StatusOr TransferManager::AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { @@ -187,8 +187,8 @@ StatusOr TransferManager::AllocateShapedBuffer( const Shape on_device_shape = HostShapeToDeviceShape(on_host_shape); TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape)); - ShapedBuffer shaped_buffer(on_host_shape, on_device_shape, - allocator->platform(), device_ordinal); + ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape, allocator, + device_ordinal); // Allocate an appropriate sized buffer for each element in the shape // including the tuple pointer arrays. @@ -204,13 +204,4 @@ StatusOr TransferManager::AllocateShapedBuffer( return std::move(shaped_buffer); } -StatusOr TransferManager::AllocateScopedShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, - int device_ordinal) { - TF_ASSIGN_OR_RETURN( - ShapedBuffer unscoped_buffer, - AllocateShapedBuffer(on_host_shape, allocator, device_ordinal)); - return ScopedShapedBuffer(std::move(unscoped_buffer), allocator); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index a6451c4bb1104cc865c010463bbfd9107f6c4f9b..d82b4f0f81b5da38c1caf80bddefa0d3f7842463 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -104,12 +104,9 @@ class TransferManager { // region for a host-to-device transfer. virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0; - // Allocate a ShapedBuffer which can hold data with the given on-host + // Allocates a ScopedShapedBuffer which can hold data with the given on-host // shape. The on-device shape may be different as indicated by // HostShapeToDeviceShape. - StatusOr AllocateShapedBuffer(const Shape& on_host_shape, - DeviceMemoryAllocator* allocator, - int device_ordinal); StatusOr AllocateScopedShapedBuffer( const Shape& on_host_shape, DeviceMemoryAllocator* allocator, int device_ordinal); diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h index 4c83750f3e6f3c735db66d8e0b86ae3f43e5ca11..a1dce758cd3ab3f204ce330fca2a7d2bdf57a2be 100644 --- a/tensorflow/compiler/xla/shape_layout.h +++ b/tensorflow/compiler/xla/shape_layout.h @@ -48,8 +48,7 @@ class ShapeLayout { bool MatchesLayoutInShape(const Shape& shape) const; // Copies the layout from the given shape into this ShapeLayout. 'other_shape' - // must be compatible with the ShapeLayout's shape, and 'other_shape' must - // have a layout (LayoutUtil::HasLayout). + // must be compatible with the ShapeLayout's shape. tensorflow::Status CopyLayoutFromShape(const Shape& other_shape); // Clears (Layout::Clear) all the Layouts stored in this object. diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 63da9154cfc1a5e7e8c0eeaa103d27096540fefe..5fa728e7c2fa5faf6ba347198fdc99e56ca4c324 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index 641b5e9a6accc0a2e7737f79bcd485d317e4e521..cccbce5fc83af87396f4d51eb9e785cea93aba0b 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -113,17 +113,19 @@ class StatusOr : private internal_statusor::StatusOrData, StatusOr& operator=(StatusOr&&) = default; // Conversion copy/move constructor, T must be convertible from U. - // TODO(b/62186717): These should not participate in overload resolution if U - // is not convertible to T. - template + template ::value>::type* = nullptr> StatusOr(const StatusOr& other); - template + template ::value>::type* = nullptr> StatusOr(StatusOr&& other); // Conversion copy/move assignment operator, T must be convertible from U. - template + template ::value>::type* = nullptr> StatusOr& operator=(const StatusOr& other); - template + template ::value>::type* = nullptr> StatusOr& operator=(StatusOr&& other); // Constructs a new StatusOr with the given value. After calling this @@ -233,12 +235,14 @@ StatusOr& StatusOr::operator=(Status&& status) { } template -template +template ::value>::type*> inline StatusOr::StatusOr(const StatusOr& other) : Base(static_cast::Base&>(other)) {} template -template +template ::value>::type*> inline StatusOr& StatusOr::operator=(const StatusOr& other) { if (other.ok()) this->Assign(other.ValueOrDie()); @@ -248,12 +252,14 @@ inline StatusOr& StatusOr::operator=(const StatusOr& other) { } template -template +template ::value>::type*> inline StatusOr::StatusOr(StatusOr&& other) : Base(static_cast::Base&&>(other)) {} template -template +template ::value>::type*> inline StatusOr& StatusOr::operator=(StatusOr&& other) { if (other.ok()) { this->Assign(std::move(other).ValueOrDie()); diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 1f90a44d8ba725c1bc7d23b581161f8915ff74fd..c28d14ba8ac3a0b893485dae65a160f903deb3f1 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -153,6 +153,8 @@ tf_cc_binary( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", @@ -191,6 +193,7 @@ cc_library( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -257,8 +260,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", @@ -288,6 +291,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -311,6 +316,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -330,6 +337,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -371,6 +380,8 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:test_utils", @@ -390,6 +401,7 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -442,6 +454,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -461,6 +475,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -478,6 +494,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -514,6 +532,8 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -535,6 +555,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -554,6 +576,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -578,6 +602,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -604,6 +630,7 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -670,6 +697,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -702,9 +731,6 @@ xla_test( "cpu": [ "--xla_cpu_multi_thread_eigen=false", ], - "cpu_parallel": [ - "--xla_cpu_multi_thread_eigen=false", - ], }, shard_count = 20, tags = ["optonly"], @@ -715,6 +741,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -738,6 +766,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -760,6 +790,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -801,7 +833,6 @@ xla_test( backend_tags = { # TODO(b/31436974): Fix msan failure. Failed on 2016-09-12. "cpu": ["nomsan"], - "cpu_parallel": ["nomsan"], }, shard_count = 30, deps = [ @@ -813,6 +844,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -836,6 +869,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -898,6 +933,8 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", @@ -923,6 +960,8 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -963,6 +1002,8 @@ xla_test( "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1038,6 +1079,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1196,6 +1239,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1235,6 +1280,8 @@ xla_test( "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1256,6 +1303,8 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1294,6 +1343,8 @@ xla_test( deps = [ "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1310,6 +1361,8 @@ xla_test( deps = [ "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1335,6 +1388,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -1355,6 +1410,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -1428,6 +1485,8 @@ xla_test( "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1472,6 +1531,8 @@ xla_test( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1514,6 +1575,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -1532,6 +1595,8 @@ xla_test( deps = [ "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1595,6 +1660,8 @@ xla_test( ":client_library_test_base", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -1608,6 +1675,8 @@ xla_test( ":client_library_test_base", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], @@ -1625,11 +1694,11 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1713,6 +1782,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:platform_util", @@ -1740,6 +1811,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:platform_util", @@ -1777,6 +1850,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1802,6 +1877,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", @@ -1860,6 +1937,8 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1886,6 +1965,8 @@ xla_test( "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -1982,6 +2063,8 @@ xla_test( ":test_utils", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index 4b4dc6dd9d3dddd105c63c7f3ce039bafca65dd7..e8a5efe796a9209307ecfa343b89f66ff2a34e0f 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" @@ -214,7 +213,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) { } XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector lhs{0xFFFFFFFF, static_cast(-1), @@ -255,7 +254,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantU64s) { } XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantS64s) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector lhs{static_cast(0x8000000000000000LL), static_cast(0x8000000000000000LL), @@ -1332,7 +1331,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) { // Some Pow cases that can be implemented more efficiently. XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values = {1.0f, 2.0f, 3.2f, -4.0f}; std::vector exponents = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1360,7 +1359,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowSpecialF32) { } XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1385,7 +1384,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowOfExpF32) { } XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, 4.0f, 0.5f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1410,7 +1409,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, LogOfPowerF32) { } XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1435,7 +1434,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulOfExpF32) { } XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.0f, 5.7f}; std::vector values1 = {0.0f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1460,7 +1459,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfExpF32) { } XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1492,7 +1491,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_lhs_F32) { } XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -1525,7 +1524,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Div3_rhs_F32) { } XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, 1.0f, 0.5f}; @@ -1558,7 +1557,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivOfPowerF32) { } XLA_TEST_F(ArrayElementwiseOpTest, Div4F32) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector values0 = {1.0f, 2.0f, 3.2f, -4.0f, 0.45f, 5.7f}; std::vector values1 = {0.1f, 1.0f, 2.0f, 0.5f, -1.0f, -0.5f}; @@ -2357,7 +2356,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Add1DTo2DF32) { XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Eq) { // Test broadcasting in Eq comparison. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v = builder.ConstantR1({42, 73}); auto m = builder.ConstantR2({{42, 73}, {42, 52}}); @@ -2783,7 +2782,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonIdentityBroadcastOfSameRankIsDisallowed) { // Regression test for b/31927799. "slice - y" is fused and requires implicit // broadcast. XLA_TEST_F(ArrayElementwiseOpTest, ImplictBroadcastInFusedExpressions) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x_literal = Literal::CreateR1({1, 2, 3}); auto y_literal = Literal::CreateR1({4, 5}); auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc index ec3b46acfec0ee0ff514a862ce5b1ca74279efa8..fcd9ff55e393f64476ddd4754e0fa74427f1cb51 100644 --- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc +++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc @@ -15,7 +15,6 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -42,7 +41,7 @@ TEST_F(AxpySimpleTest, AxTenValues) { } XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) { - ComputationBuilder builder(client_, "axpy_10"); + XlaBuilder builder("axpy_10"); auto alpha = builder.ConstantR0(3.1415926535); auto x = builder.ConstantR1({}); auto y = builder.ConstantR1({}); @@ -54,7 +53,7 @@ XLA_TEST_F(AxpySimpleTest, AxpyZeroValues) { } TEST_F(AxpySimpleTest, AxpyTenValues) { - ComputationBuilder builder(client_, "axpy_10"); + XlaBuilder builder("axpy_10"); auto alpha = builder.ConstantR0(3.1415926535); auto x = builder.ConstantR1( {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc index e4bf1827acf24bcdbfe20fe39e794a0265ab89e3..22c3394e6f34bd018ffaaaa4d9d68339673c3764 100644 --- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc +++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc @@ -18,9 +18,9 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -34,13 +34,13 @@ namespace { class BadRngShapeValidationTest : public ClientLibraryTestBase {}; TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0.0); auto one = builder.ConstantR0(1.0); Shape default_constructed; builder.RngUniform(zero, one, default_constructed); - StatusOr computation = builder.Build(); + StatusOr computation = builder.Build(); EXPECT_FALSE(computation.ok()); LOG(INFO) << "status received: " << computation.status(); EXPECT_THAT(computation.status().error_message(), @@ -48,7 +48,7 @@ TEST_F(BadRngShapeValidationTest, DefaultConstructedShapeCreatesError) { } TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0.0); auto one = builder.ConstantR0(1.0); Shape sans_layout; @@ -57,7 +57,7 @@ TEST_F(BadRngShapeValidationTest, ShapeWithoutLayoutIsOk) { builder.RngUniform(zero, one, sans_layout); - StatusOr computation = builder.Build(); + StatusOr computation = builder.Build(); ASSERT_TRUE(computation.ok()); LOG(INFO) << computation.status(); } diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc index b853dfaa15d7ff2e21048a5a6a486d22c5a05416..4e65cf11f3f1a027e1adc5a89930caba28958fea 100644 --- a/tensorflow/compiler/xla/tests/bfloat16_test.cc +++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc @@ -19,10 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -52,7 +51,7 @@ class Bfloat16Test : public ClientLibraryTestBase { }; XLA_TEST_F(Bfloat16Test, ScalarOperation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR0(static_cast(2.0f)); auto y = builder.ConstantR0(static_cast(1.0f)); builder.Add(x, y); @@ -62,7 +61,7 @@ XLA_TEST_F(Bfloat16Test, ScalarOperation) { } XLA_TEST_F(Bfloat16Test, LogOperation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR0(static_cast(4.0f)); builder.Log(x); @@ -71,7 +70,7 @@ XLA_TEST_F(Bfloat16Test, LogOperation) { } XLA_TEST_F(Bfloat16Test, NegateScalarF16) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Neg(builder.ConstantR0(static_cast(2.1f))); ComputeAndCompareR0(&builder, static_cast(-2.1f), {}, @@ -80,7 +79,7 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) { XLA_TEST_F(Bfloat16Test, BatchNormTraining) { const int kFeatureIndex = 2; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto operand = builder.ConstantR4FromArray4D( {{{{static_cast(1.f)}, {static_cast(2.f)}}, @@ -117,7 +116,7 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) { XLA_TEST_F(Bfloat16Test, BatchNormGrad) { const int kFeatureIndex = 2; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto operand = builder.ConstantR4FromArray4D( Array4D(2, 2, 2, 1, static_cast(0.0f))); diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc index 97fec89b63fb8d3a4264275f3253a91e1ea2ce68..48203b1d40ea69ff00a57c2c9e42620739b23d59 100644 --- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc +++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -32,7 +32,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_32x4) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 32, 4); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 4); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -48,7 +48,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixRowVector_129x129) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 129); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 1, 129); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -64,7 +64,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_9x5) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 9, 5); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 9, 1); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -80,7 +80,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) { auto alhs = MakeLinspaceArray2D(0.0, 1.0, 129, 257); auto arhs = MakeLinspaceArray2D(0.0, 1.0, 129, 1); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); builder.Add(lhs, rhs); @@ -93,7 +93,7 @@ TEST_F(BinopScalingTest, MatrixPlusPseudoMatrixColVector_129x257) { } TEST_F(BinopScalingTest, R0PlusR2F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR0(42.0); auto rhs = builder.ConstantR2({ {1.0, 2.0}, {3.0, 4.0}, @@ -109,7 +109,7 @@ TEST_F(BinopScalingTest, R0PlusR2F32) { } TEST_F(BinopScalingTest, R4PlusR0S32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // clang-format off Array4D lhs_array({ {{{1, 2}, diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc index 97095f1cc427789845051a8fea24c95475286fe2..34c86e007beea1cbac04641bdbdab62dc567f13e 100644 --- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc @@ -19,8 +19,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -33,10 +33,8 @@ namespace { class BroadcastSimpleTest : public ClientLibraryTestBase { public: - ComputationDataHandle BuildBinOp(HloOpcode op, - const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, - ComputationBuilder* builder) { + XlaOp BuildBinOp(HloOpcode op, const XlaOp& lhs, const XlaOp& rhs, + XlaBuilder* builder) { switch (op) { case HloOpcode::kMinimum: { return builder->Min(lhs, rhs); @@ -105,21 +103,21 @@ class BroadcastSimpleTest : public ClientLibraryTestBase { using ::testing::HasSubstr; XLA_TEST_F(BroadcastSimpleTest, ScalarNoOpBroadcast) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(1.5), {}); ComputeAndCompareR0(&b, 1.5, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x3) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(2.25), {2, 3}); Array2D expected(2, 3, 2.25); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { - ComputationBuilder b(client_, TestName()); - ComputationDataHandle src; + XlaBuilder b(TestName()); + XlaOp src; std::unique_ptr param_data = CreateR0Parameter(2.25f, /*parameter_number=*/0, /*name=*/"src", /*builder=*/&b, /*data_handle=*/&src); @@ -131,21 +129,21 @@ XLA_TEST_F(BroadcastSimpleTest, ScalarParamTo2D_2x3) { } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_2x0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(2.25), {2, 0}); Array2D expected(2, 0); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, ScalarTo2D_0x2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR0(2.25), {0, 2}); Array2D expected(0, 2); ComputeAndCompareR2(&b, expected, {}, ErrorSpec(0.0001)); } XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR1({1, 2, 3}), {2}); Array2D expected(2, 3); @@ -160,7 +158,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) { // Tests implicit broadcasting of PREDs. XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); Array2D x_vals(2, 1); x_vals(0, 0) = true; @@ -171,7 +169,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { y_vals(1, 0, 0) = true; y_vals(1, 1, 0) = true; - ComputationDataHandle x, y; + XlaOp x, y; auto x_data = CreateR2Parameter(x_vals, 0, "x", &b, &x); auto y_data = CreateR3Parameter(y_vals, 1, "y", &b, &y); b.And(x, y, /*broadcast_dimensions=*/{1, 2}); @@ -186,7 +184,7 @@ XLA_TEST_F(BroadcastSimpleTest, BooleanAnd2DTo3D_Pred) { } XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR1({}), {2}); Array2D expected(2, 0); @@ -194,7 +192,7 @@ XLA_TEST_F(BroadcastSimpleTest, ZeroElement_1DTo2D) { } XLA_TEST_F(BroadcastSimpleTest, 1DToZeroElement2D) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Broadcast(b.ConstantR1({1, 2, 3}), {0}); Array2D expected(0, 3); @@ -209,7 +207,7 @@ XLA_TEST_F(BroadcastSimpleTest, InDimensionAndDegenerateBroadcasting) { // broadcasting (broadcast_dimensions {1, 2}), then is added to the rhs shape // [2, 3, 1]. Degenerate dimension broadcasting then broadcasts the size one // dimensions. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 5.0}}), b.ConstantLiteral(*Literal::CreateR3( @@ -247,7 +245,7 @@ class BroadcastR3ImplicitTest XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { const R3ImplicitBroadcastSpec& spec = GetParam(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Shape r3_shape, r3_implicit_shape; Array3D r3_array(spec.output_bounds[0], spec.output_bounds[1], @@ -264,8 +262,7 @@ XLA_TEST_P(BroadcastR3ImplicitTest, Doit) { auto r3_implicit_parameter = builder.Parameter(0, r3_implicit_shape, "input"); auto r3_parameter = builder.Parameter(1, r3_shape, "input"); - ComputationDataHandle op = - BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); + XlaOp op = BuildBinOp(spec.op, r3_implicit_parameter, r3_parameter, &builder); Array3D expected_array(spec.output_bounds[0], spec.output_bounds[1], spec.output_bounds[2]); @@ -300,9 +297,9 @@ INSTANTIATE_TEST_CASE_P(BroadcastR3ImplicitTestInstances, // r1 and r3's dim0 matches, and r1's dim1 and dim2 have size 1: XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) { - ComputationBuilder b(client_, TestName()); - ComputationDataHandle r1h; - ComputationDataHandle r3h; + XlaBuilder b(TestName()); + XlaOp r1h; + XlaOp r3h; Array3D r1d = {{{1}}, {{2}}}; Array3D r3d = {{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}; @@ -319,7 +316,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1_2) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -332,7 +329,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}, {2}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -345,7 +342,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_2) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}, {3, 4}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -358,7 +355,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1, 2}}, {{3, 4}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -371,7 +368,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_1) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}, {2}}, {{3}, {4}}})); auto r3 = b.ConstantLiteral( @@ -385,7 +382,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_2) { } XLA_TEST_F(BroadcastSimpleTest, Add3DTo3DDegenerate_0_1_2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR3({{{1}}})); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -491,7 +488,7 @@ class BroadcastR2ImplicitTest XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { const R2ImplicitBroadcastSpec& spec = GetParam(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Operands with degenerate dimensions require implicit broadcasting: Shape r2_shape, r2_implicit_shape1, r2_implicit_shape2; @@ -517,10 +514,9 @@ XLA_TEST_P(BroadcastR2ImplicitTest, Doit) { auto r2_implicit_parameter2 = builder.Parameter(2, r2_implicit_shape2, "input2"); - ComputationDataHandle op1 = + XlaOp op1 = BuildBinOp(spec.op1, r2_implicit_parameter1, r2_parameter, &builder); - ComputationDataHandle op2 = - BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); + XlaOp op2 = BuildBinOp(spec.op2, op1, r2_implicit_parameter2, &builder); Array2D expected_array(spec.output_bounds[0], spec.output_bounds[1]); @@ -547,7 +543,7 @@ INSTANTIATE_TEST_CASE_P(BroadcastR2ImplicitTestInstances, ::testing::ValuesIn(kR2ImplicitBroadcastTestCases)); XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}})); auto r2 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}, {3, 4}})); b.Add(r2, r1); @@ -558,7 +554,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_0) { } XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantLiteral(*Literal::CreateR2({{1}, {2}})); auto r2 = b.ConstantLiteral(*Literal::CreateR2({{1, 2}, {3, 4}})); b.Add(r2, r1); @@ -569,7 +565,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add2DTo2DDegenerate_1) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantR1({10, 20}); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -582,7 +578,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim0) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantR1({10, 20}); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -595,7 +591,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim1) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1 = b.ConstantR1({10, 20}); auto r3 = b.ConstantLiteral( *Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})); @@ -608,7 +604,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDim2) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1_0 = b.ConstantR1({1000, 2000}); auto r1_1 = b.ConstantR1({100, 200}); auto r1_2 = b.ConstantR1({10, 20}); @@ -629,7 +625,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAll) { } XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto r1_0 = b.ConstantR1({1000, 2000}); auto r1_1 = b.ConstantR1({100, 200}); auto r1_2 = b.ConstantR1({10, 20}); @@ -652,7 +648,7 @@ XLA_TEST_F(BroadcastSimpleTest, Add1DTo3DInDimAllWithScalarBroadcast) { XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { // Binary dimension broadcasting of the smaller lhs ([2, 2] up to [2, 2, 2]) // results in a shape incompatible with the lhs [2, 3, 1]. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 5.0}, {1.0, 5.0}}), b.ConstantLiteral(*Literal::CreateR3( @@ -667,7 +663,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidBinaryAndDegenerateBroadcasting) { XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { // Test invalid broadcasting with [1, 2] and [2, 3] inputs. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 2.0}}), b.ConstantR2({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); @@ -680,7 +676,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) { XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) { // Test invalid broadcasting with [1, 2] and [2, 3] inputs. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Add(b.ConstantR2({{1.0, 2.0}}), b.ConstantR2({{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}})); diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index eac2eb286c3f7a1cd33aed03686e99ef753b773a..53f2c3bfbfce9585cb68f103a495ce2f1ad8432e 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured") load("//tensorflow/compiler/xla/tests:plugin.bzl", "plugins") load("//tensorflow:tensorflow.bzl", "tf_cc_test") -all_backends = ["cpu", "cpu_parallel", "gpu"] + plugins.keys() +all_backends = ["cpu", "gpu"] + plugins.keys() def filter_backends(backends): """Removes "gpu" from a backend list if CUDA is not enabled. @@ -39,10 +39,10 @@ def xla_test(name, **kwargs): """Generates cc_test targets for the given XLA backends. - This rule generates a cc_test target for one or more XLA backends and also - a platform-agnostic cc_library rule. The arguments are identical to cc_test - with two additions: 'backends' and 'backend_args'. 'backends' specifies the - backends to generate tests for ("cpu", "cpu_parallel", "gpu"), and + This rule generates a cc_test target for one or more XLA backends and also a + platform-agnostic cc_library rule. The arguments are identical to cc_test with + two additions: 'backends' and 'backend_args'. 'backends' specifies the + backends to generate tests for ("cpu", "gpu"), and 'backend_args'/'backend_tags' specifies backend-specific args parameters to use when generating the cc_test. @@ -90,9 +90,9 @@ def xla_test(name, deps: Dependencies of the target. xla_test_library_deps: If set, the generated test targets will depend on the respective cc_libraries generated by the xla_test_library rule. - backends: A list of backends to generate tests for. Supported - values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will - be generated for all supported backends. + backends: A list of backends to generate tests for. Supported values: "cpu", + "gpu". If this list is empty, the test will be generated for all supported + backends. blacklisted_backends: A list of backends to NOT generate tests for. args: Test arguments for the target. tags: Tags for the target. @@ -128,10 +128,6 @@ def xla_test(name, if backend == "cpu": backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"] backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"] - elif backend == "cpu_parallel": - backend_deps = ["//tensorflow/compiler/xla/service:cpu_plugin"] - backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_cpu"] - this_backend_args += ["--xla_backend_extra_options=\"xla_cpu_parallel\""] elif backend == "gpu": backend_deps = ["//tensorflow/compiler/xla/service:gpu_plugin"] backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"] @@ -201,7 +197,7 @@ def xla_test_library(name, hdrs: Headers for the target. deps: Dependencies of the target. backends: A list of backends to generate libraries for. - Supported values: "cpu", "cpu_parallel", "gpu". If this list is empty, the + Supported values: "cpu", "gpu". If this list is empty, the library will be generated for all supported backends. """ @@ -210,7 +206,7 @@ def xla_test_library(name, for backend in filter_backends(backends): this_backend_copts = [] - if backend in ["cpu", "cpu_parallel", "gpu"]: + if backend in ["cpu", "gpu"]: backend_deps = ["//tensorflow/compiler/xla/tests:test_macros_%s" % backend] elif backend in plugins: backend_deps = plugins[backend]["deps"] diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 69389dae3f28a2a20eb8a26e72ab3541eb565305..22660c35dcaa0ebbb553aa2d5e2412043a2bb300 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -61,6 +61,11 @@ ClientLibraryTestBase::ClientLibraryTestBase( : client_(GetOrCreateLocalClientOrDie(client_options)), execution_options_(CreateDefaultExecutionOptions()) { CHECK_EQ(platform, client_options.platform()); + + LocalClientOptions ref_options; + ref_options.set_platform(GetReferencePlatform()); + ref_client_ = GetOrCreateLocalClientOrDie(ref_options); + // Disabling constant_folding so that tests (usually written using Constants) // will exercise the intended code paths, instead of being constant folded. // @@ -152,6 +157,7 @@ ClientLibraryTestBase::ExecuteAndTransferReference( *execution_options.mutable_shape_with_output_layout() = *shape_with_output_layout; } + execution_options.clear_device_handles(); return ref_client_->ExecuteAndTransfer(computation, arguments, &execution_options); } @@ -211,6 +217,14 @@ void ClientLibraryTestBase::ComputeAndCompareR1( arguments); } +void ClientLibraryTestBase::ComputeAndCompareR1( + XlaBuilder* builder, const tensorflow::core::Bitmap& expected, + tensorflow::gtl::ArraySlice arguments) { + std::unique_ptr expected_literal = Literal::CreateR1(expected); + ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal, + arguments); +} + template void ClientLibraryTestBase::ComputeAndCompareLiteral( BuilderT* builder, const Literal& expected, @@ -452,7 +466,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } void ClientLibraryTestBase::ComputeAndCompareR1U8( - ComputationBuilder* builder, tensorflow::StringPiece expected, + XlaBuilder* builder, tensorflow::StringPiece expected, tensorflow::gtl::ArraySlice arguments) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); @@ -613,8 +627,8 @@ ClientLibraryTestBase::ComputeValueAndReference( return std::make_pair(std::move(reference), std::move(result)); } -Computation ClientLibraryTestBase::CreateScalarRelu() { - ComputationBuilder builder(client_, "relu"); +XlaComputation ClientLibraryTestBase::CreateScalarRelu() { + XlaBuilder builder("relu"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); auto z_value = builder.Parameter(0, shape, "z_value"); auto zero = use_bfloat16_ @@ -626,8 +640,8 @@ Computation ClientLibraryTestBase::CreateScalarRelu() { return computation_status.ConsumeValueOrDie(); } -Computation ClientLibraryTestBase::CreateScalarMax() { - ComputationBuilder builder(client_, "max"); +XlaComputation ClientLibraryTestBase::CreateScalarMax() { + XlaBuilder builder("max"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); auto x = builder.Parameter(0, shape, "x"); auto y = builder.Parameter(1, shape, "y"); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 481d7c5c25a7e8ad9d40d11fa3f84e13d6922aa3..32eea7c2f3a65d2b4a83435ec6258ea9cf6aaf6a 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -165,6 +165,9 @@ class ClientLibraryTestBase : public ::testing::Test { void ComputeAndCompareR1(ComputationBuilder* builder, const tensorflow::core::Bitmap& expected, tensorflow::gtl::ArraySlice arguments); + void ComputeAndCompareR1(XlaBuilder* builder, + const tensorflow::core::Bitmap& expected, + tensorflow::gtl::ArraySlice arguments); template void ComputeAndCompareR2(BuilderT* builder, const Array2D& expected, @@ -219,7 +222,7 @@ class ClientLibraryTestBase : public ::testing::Test { // Compare the result of the computation to a strings. In XLA strings are // represented using rank-1 U8 shapes. void ComputeAndCompareR1U8( - ComputationBuilder* builder, tensorflow::StringPiece expected, + XlaBuilder* builder, tensorflow::StringPiece expected, tensorflow::gtl::ArraySlice arguments); // Convenience method for running a built computation, transferring the @@ -252,8 +255,8 @@ class ClientLibraryTestBase : public ::testing::Test { ErrorSpec error); // Create scalar operations for use in reductions. - Computation CreateScalarRelu(); - Computation CreateScalarMax(); + XlaComputation CreateScalarRelu(); + XlaComputation CreateScalarMax(); Computation CreateScalarReluSensitivity(); // Special case convenience functions for creating filled arrays. diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc index 32e2f2c0848407ec46a5ac52e2668ef27b92c426..1e5447179677318eb604590195e97f811024898a 100644 --- a/tensorflow/compiler/xla/tests/client_test.cc +++ b/tensorflow/compiler/xla/tests/client_test.cc @@ -109,8 +109,7 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) { /*minor_to_major=*/{1, 0}))); } -XLA_TEST_F(ClientTest, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) { +XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) { XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg; Shape shape = ShapeUtil::MakeShape(S32, {2, 2}); diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc index 896b34fb6e2762c14bd9ec2bf1ba13c548d4cf60..b5a42e305987df030c15d089f5877f73bb61de1b 100644 --- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc @@ -18,9 +18,9 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -34,13 +34,35 @@ limitations under the License. namespace xla { namespace { +StatusOr CreateConvDimensionNumbers( + int64 input_batch, int64 input_feature, int64 input_first_spatial, + int64 input_second_spatial, int64 output_batch, int64 output_feature, + int64 output_first_spatial, int64 output_second_spatial, + int64 kernel_output_feature, int64 kernel_input_feature, + int64 kernel_first_spatial, int64 kernel_second_spatial) { + ConvolutionDimensionNumbers dimension_numbers; + dimension_numbers.set_input_batch_dimension(input_batch); + dimension_numbers.set_input_feature_dimension(input_feature); + dimension_numbers.add_input_spatial_dimensions(input_first_spatial); + dimension_numbers.add_input_spatial_dimensions(input_second_spatial); + dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature); + dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature); + dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial); + dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial); + dimension_numbers.set_output_batch_dimension(output_batch); + dimension_numbers.set_output_feature_dimension(output_feature); + dimension_numbers.add_output_spatial_dimensions(output_first_spatial); + dimension_numbers.add_output_spatial_dimensions(output_second_spatial); + TF_RETURN_IF_ERROR(XlaBuilder::Validate(dimension_numbers)); + return dimension_numbers; +} + class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {}; // Tests the convolution operation with invalid input dimension numbers. TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) { auto dimension_numbers_status = - ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, - 1, 2, 3); + CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); ASSERT_THAT(dimension_numbers_status.status().error_message(), ::testing::HasSubstr("input are not unique")); @@ -49,8 +71,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) { // Tests the convolution operation with invalid weight dimension numbers. TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) { auto dimension_numbers_status = - ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, - 2, 2, 3); + CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); ASSERT_THAT(dimension_numbers_status.status().error_message(), ::testing::HasSubstr("weight are not unique")); @@ -59,8 +80,7 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) { // Tests the convolution operation with invalid output dimension numbers. TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) { auto dimension_numbers_status = - ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, - 1, 2, 3); + CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0, 1, 2, 3); ASSERT_FALSE(dimension_numbers_status.ok()); ASSERT_THAT(dimension_numbers_status.status().error_message(), ::testing::HasSubstr("output are not unique")); @@ -76,14 +96,14 @@ XLA_TEST_F(ConvolutionDimensionNumbersTest, client_->TransferToServer(*Literal::CreateR4FromArray4D(*weight_array)) .ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(*input_array); auto weight = builder.Parameter(0, ShapeUtil::MakeShape(F32, {4, 3, 1, 1}), "weight"); auto conv1 = builder.Conv(input, weight, {1, 1}, Padding::kValid); ConvolutionDimensionNumbers dim_nums = - ComputationBuilder::CreateDefaultConvDimensionNumbers(); + XlaBuilder::CreateDefaultConvDimensionNumbers(); // Swap batch_dimension and feature_dimension. int64 old_input_batch_dim = dim_nums.input_batch_dimension(); int64 old_output_batch_dim = dim_nums.output_batch_dimension(); diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index 9c1145def8c11f1222c63adf006102887d49f00d..50d6e25d868c4964ff35023b43a3734ed115bbb8 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -52,7 +53,7 @@ class ConvolutionVariantsTest : public ClientLibraryTestBase { }; XLA_TEST_F(ConvolutionVariantsTest, Minimal) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Array4D input_array(1, 1, 1, 1, {2}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -67,7 +68,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Minimal) { } XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Array4D input_array(5, 1, 1, 1, {1, 2, 3, 4, 5}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -82,7 +83,7 @@ XLA_TEST_F(ConvolutionVariantsTest, MinimalWithBatch) { } XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(2, 1, 3, 4); input_array.FillWithMultiples(1); @@ -99,7 +100,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Flat1x1) { } XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 2, 1, 1, {10, 1}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -114,7 +115,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Deep1x1) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 2, {1, 2}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -129,7 +130,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -144,7 +145,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in1x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -159,7 +160,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -174,7 +175,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1in2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -189,7 +190,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2in2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array( 2, 2, 2, 3, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, // plane 0 @@ -210,7 +211,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2in2x3WithDepthAndBatch) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 4, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -225,7 +226,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x4) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 5, {1, 2, 3, 4, 5}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -240,7 +241,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride1x2in1x5) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 4, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -255,7 +256,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x4) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 5, {1, 2, 3, 4, 5}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -270,7 +271,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x3stride1x2in1x5) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -285,7 +286,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1stride2x2in3x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 1, {1}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -300,7 +301,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x1in1x1Padded) { } XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -315,7 +316,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter5x1in3x1Padded) { } XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 2, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -333,7 +334,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 2, 1, 2, {1, 2, 3, 4}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -348,7 +349,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1in2x1WithPaddingAndDepth) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 3, 3, {1, 2, 3, 4, 5, 6, 7, 8, 9}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -363,7 +364,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2Stride1x1Input3x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(1, 1, 1, 3, {1, 2, 3}); auto input = builder.ConstantR4FromArray4D(input_array); @@ -378,7 +379,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2Stride1x1Input1x3) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(64); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -398,7 +399,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x1x8x8Input1x1x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(16 * 1 * 1 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -419,7 +420,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input16x1x1x1) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); constexpr int bs = 16; constexpr int kx = 2; @@ -450,7 +451,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input16x1x2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); constexpr int kx = 2; constexpr int ky = 2; @@ -482,7 +483,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(16, 1, 8, 8); for (int i0 = 0; i0 < 16; ++i0) { @@ -510,7 +511,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x8x8Input16x1x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -536,7 +537,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input1x2x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(2 * 2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -562,7 +563,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input2x2x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(32 * 2 * 8 * 8); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -602,7 +603,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter2x2x8x8Input32x2x8x8) { } XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D input_array(16, 16, 1, 1); Array4D filter_array(16, 16, 1, 1); @@ -628,7 +629,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter16x16x1x1Input16x16x1x1) { } XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 4 * 6); std::iota(input_data.begin(), input_data.end(), 0.0); @@ -640,14 +641,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatRhsDilation) { builder.ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{2, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 2, 2, {3924, 4257, 5922, 6255}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -659,14 +660,14 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation1D) { builder.ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 8, {10, 2, 20, 3, 30, 4, 40, 5}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 3 * 4); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -682,8 +683,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { builder.ConvGeneralDilated( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{2, 1}, /*padding=*/{{1, 0}, {0, 0}}, /*lhs_dilation=*/{3, 2}, - /*rhs_dilation=*/{}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + /*rhs_dilation=*/{}, XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 3, 5, {204, 40, 406, 60, 608, // @@ -693,7 +693,7 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) { } XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -705,14 +705,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingOnBothEnds) { builder.ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-1, -1}}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 2, {23, 34}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -724,14 +724,14 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingLowAndPositivePaddingHigh) { builder.ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-1, 2}}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 5, {23, 34, 45, 50, 0}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -743,14 +743,14 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingLowAndNegativePaddingHigh) { builder.ConvGeneral( /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {2, -1}}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); Array4D expected(1, 1, 1, 5, {0, 1, 12, 23, 34}); ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -763,7 +763,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {3, 2}}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); // input: // [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5] @@ -775,7 +775,7 @@ XLA_TEST_F(ConvolutionVariantsTest, PositivePaddingAndDilation) { ComputeAndCompareR4(&builder, expected, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 1 * 1 * 5); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -788,7 +788,7 @@ XLA_TEST_F(ConvolutionVariantsTest, NegativePaddingAndDilation) { /*lhs=*/input, /*rhs=*/filter, /*window_strides=*/{}, /*padding=*/{{0, 0}, {-3, -2}}, /*lhs_dilation=*/{1, 2}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); // input: // [1, 2, 3, 4, 5] --dilate-> [1, 0, 2, 0, 3, 0, 4, 0, 5] @@ -821,7 +821,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x1x2x3_Filter2x1x1x2) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -854,7 +854,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input1x16x1x1_Filter1x16x1x1) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -887,7 +887,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter1x16x1x1) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -920,7 +920,7 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) { Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -954,7 +954,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Array4D input_array(bs, iz, iy, ix, input_data); Array4D filter_array(oz, iz, ky, kx, kernel_data); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR4FromArray4D(input_array); auto filter = builder.ConstantR4FromArray4D(filter_array); builder.Conv(input, filter, {1, 1}, Padding::kValid); @@ -966,7 +966,7 @@ XLA_TEST_F(ConvolutionVariantsTest, } XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1010,7 +1010,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1054,7 +1054,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 1); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1095,7 +1095,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) { } XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector input_data(1 * 2 * 3 * 2); std::iota(input_data.begin(), input_data.end(), 1.0); @@ -1147,7 +1147,7 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) { // BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1) XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); @@ -1166,19 +1166,18 @@ XLA_TEST_F(ConvolutionVariantsTest, // BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1)) XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingGreaterThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 1, /*values=*/{1})); auto weights = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 3, /*values=*/{1, 10, 100})); auto mirrored_weights = builder.Rev(weights, {2, 3}); - builder.ConvGeneralDilated( - gradients, mirrored_weights, - /*window_strides=*/{1, 1}, - /*padding=*/{{0, 0}, {0, 3}}, - /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + builder.ConvGeneralDilated(gradients, mirrored_weights, + /*window_strides=*/{1, 1}, + /*padding=*/{{0, 0}, {0, 3}}, + /*lhs_dilation=*/{1, 3}, /*rhs_dilation=*/{}, + XlaBuilder::CreateDefaultConvDimensionNumbers()); ComputeAndCompareR4(&builder, {{{{100, 0}}}}, {}, error_spec_); } @@ -1187,7 +1186,7 @@ XLA_TEST_F(ConvolutionVariantsTest, // into // BackwardInputConv([1], [1,10,100], padding=(1,1)) XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 1, /*values=*/{1})); @@ -1208,7 +1207,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding) { // However, XLA:GPU doesn't actually fuse it because PadInsertion doesn't // support negative padding on backward convolution yet (b/32744257). XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR4FromArray4D( Array4D(1, 1, 1, 3, /*values=*/{1, 2, 3})); @@ -1224,7 +1223,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) { XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingLessThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // activations: 1,2,3,4 ---pad--> 0,1,2,3,4,0,0 // gradients: 100,10,1 -dilate-> 100,0,10,0,1 @@ -1240,7 +1239,7 @@ XLA_TEST_F(ConvolutionVariantsTest, /*window_strides=*/{1, 1}, /*padding=*/{{0, 0}, {1, 2}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); builder.Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{24, 130, 240}}}}, {}, error_spec_); @@ -1248,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest, XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingGreaterThanHighPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // activations: 1,2,3,4 ---pad--> 0,0,1,2,3,4 // gradients: 100,10,1 -dilate-> 100,0,10,0,1 @@ -1266,14 +1265,14 @@ XLA_TEST_F(ConvolutionVariantsTest, /*window_strides=*/{1, 1}, /*padding=*/{{0, 0}, {2, 0}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); builder.Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{13, 24}}}}, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // activations: 1,2,3,4 ---pad--> 0,0,1,2,3,4,0 // gradients: 100,10,1 -dilate-> 100,0,10,0,1 @@ -1293,14 +1292,14 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding) { /*window_strides=*/{1, 1}, /*padding=*/{{0, 0}, {2, 1}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers()); + XlaBuilder::CreateDefaultConvDimensionNumbers()); builder.Transpose(forward_conv, {0, 1, 2, 3}); ComputeAndCompareR4(&builder, {{{{13, 24, 130}}}}, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients = builder.ConstantR3FromArray3D( Array3D(1, 1, 1, /*value=*/1)); @@ -1314,26 +1313,26 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding1D) { } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto activations = builder.ConstantR3FromArray3D(Array3D({{{1, 2, 3, 4}}})); auto gradients = builder.ConstantR3FromArray3D(Array3D({{{100, 10, 1}}})); - auto forward_conv = builder.ConvGeneralDilated( - activations, gradients, - /*window_strides=*/{1}, - /*padding=*/{{2, 1}}, - /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers( - /*num_spatial_dims=*/1)); + auto forward_conv = + builder.ConvGeneralDilated(activations, gradients, + /*window_strides=*/{1}, + /*padding=*/{{2, 1}}, + /*lhs_dilation=*/{}, /*rhs_dilation=*/{2}, + XlaBuilder::CreateDefaultConvDimensionNumbers( + /*num_spatial_dims=*/1)); builder.Transpose(forward_conv, {0, 1, 2}); ComputeAndCompareR3(&builder, {{{13, 24, 130}}}, {}, error_spec_); } XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto gradients_flat = Literal::CreateR1({1}); auto gradients_literal = @@ -1357,7 +1356,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { } XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto activations_flat = Literal::CreateR1({1, 2, 3, 4}); auto activations_literal = @@ -1378,7 +1377,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { /*window_strides=*/{1, 1, 1}, /*padding=*/{{0, 0}, {0, 0}, {2, 1}}, /*lhs_dilation=*/{}, /*rhs_dilation=*/{1, 1, 2}, - ComputationBuilder::CreateDefaultConvDimensionNumbers( + XlaBuilder::CreateDefaultConvDimensionNumbers( /*num_spatial_dims=*/3)); builder.Transpose(forward_conv, {0, 1, 2, 3, 4}); ComputeAndCompareLiteral(&builder, *expected_literal, {}, error_spec_); diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 021fbcedb994eb4ea86a9a2da691900a56cd45f2..ff53a84588fc04effac217112425e5868dfeaa67 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -470,13 +470,6 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { template void RunR3Contiguous(std::vector operand_shape, int32 index, int32 size) { -#ifdef XLA_TEST_BACKEND_CPU_PARALLEL - // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. - if (std::is_same::value) { - return; - } -#endif - const int32 kSeq = operand_shape[0]; const int32 kBatch = operand_shape[1]; const int32 kDim = operand_shape[2]; @@ -539,30 +532,22 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0(); } // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) { - TestR1(); -} +XLA_TEST_F(DynamicUpdateSliceTest, Int32R1BF16) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R1) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R1) { TestR1(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R1) { TestR1(); } -// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R2BF16)) { - TestR2(); -} +XLA_TEST_F(DynamicUpdateSliceTest, Int32R2BF16) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R2) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R2) { TestR2(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R2) { TestR2(); } -// TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10. -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R3BF16)) { - TestR3(); -} +XLA_TEST_F(DynamicUpdateSliceTest, Int32R3BF16) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32WrapBF16)) { +XLA_TEST_F(DynamicUpdateSliceTest, Int32WrapBF16) { TestWrap(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap(); } diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc index 644cbbf40f296eb2a574ae568b4f32aa3d0bd12f..c8cc8e40aa32105e56e7e13489734af6ce56e0c7 100644 --- a/tensorflow/compiler/xla/tests/execution_profile_test.cc +++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc @@ -24,8 +24,7 @@ namespace { class ExecutionProfileTest : public ClientLibraryTestBase {}; -XLA_TEST_F(ExecutionProfileTest, - DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) { +XLA_TEST_F(ExecutionProfileTest, ExecuteWithExecutionProfile) { Shape shape = ShapeUtil::MakeShape(F32, {256, 256}); TF_ASSERT_OK_AND_ASSIGN( diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index c7f64d856097350a948f6b15cefb147aae7cee18..6f89e9164c8d44cbc6afcceb653b0e57841bf500 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -794,19 +794,19 @@ void BM_ParallelFusion(int num_iters) { // Transfer literals to device. auto param0_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1); - ShapedBuffer buffer0 = + ScopedShapedBuffer buffer0 = client->LiteralToShapedBuffer(*param0_literal, device_ordinal) .ConsumeValueOrDie(); auto param1_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1); - ShapedBuffer buffer1 = + ScopedShapedBuffer buffer1 = client->LiteralToShapedBuffer(*param1_literal, device_ordinal) .ConsumeValueOrDie(); auto param2_literal = Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1); - ShapedBuffer buffer2 = + ScopedShapedBuffer buffer2 = client->LiteralToShapedBuffer(*param2_literal, device_ordinal) .ConsumeValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc index 90496d55e60b4f45fc2d46b2746f94d775cf9f94..4dd3acd9af16210408229d0c8980173314766278 100644 --- a/tensorflow/compiler/xla/tests/gather_operation_test.cc +++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc @@ -401,10 +401,7 @@ ENTRY main { class GatherClientLibraryTest : public ClientLibraryTestBase {}; -// TODO(b/30671675): Asynchronous execution on stream is not yet supported on -// GPU and CPU_PARALLEL. -XLA_TEST_F(GatherClientLibraryTest, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) { +XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { // We create this HLO, but using the XlaBuilder API. // // ENTRY main { diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc index 7209f91639b5f24e7da569135c3b9444ed522d6e..f21f83992ffb7c07dff31c68a7e9e3f7944bf512 100644 --- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc @@ -15,9 +15,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -38,7 +37,7 @@ class LocalClientAllocationTest : public LocalClientTestBase { }; XLA_TEST_F(LocalClientAllocationTest, AddVectors) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({0.0f, 1.0f, 2.0f}); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -74,7 +73,7 @@ XLA_TEST_F(LocalClientAllocationTest, AddVectors) { XLA_TEST_F(LocalClientAllocationTest, RunOnDevices) { // Run a computation on every device on the system. Verify that allocation // occurs on the proper device. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({0.0f, 1.0f, 2.0f}); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 7e14e77366d1d4d7f21a585883bb2fc750f4247e..44c6811df84f49b6c1b24c11449939e2d375a9d1 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -18,9 +18,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" @@ -54,7 +53,7 @@ class LocalClientExecuteTest : public LocalClientTestBase { }; XLA_TEST_F(LocalClientExecuteTest, Constant) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto y = builder.ConstantR0(123.0f); ScopedShapedBuffer result = @@ -64,7 +63,7 @@ XLA_TEST_F(LocalClientExecuteTest, Constant) { } XLA_TEST_F(LocalClientExecuteTest, AddScalars) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto y = builder.ConstantR0(123.0f); builder.Add(x, y); @@ -77,7 +76,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddScalars) { } XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0}), "x"); auto y = builder.ConstantR1({}); builder.Add(x, y); @@ -90,7 +89,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddZeroElementVectors) { } XLA_TEST_F(LocalClientExecuteTest, AddVectors) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -104,7 +103,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectors) { } XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -122,7 +121,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddVectorsWithProfile) { } XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Add(x, y); @@ -155,7 +154,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) { } XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Add(x, y); @@ -192,7 +191,7 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentOutputLayouts) { } XLA_TEST_F(LocalClientExecuteTest, TupleResult) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Tuple({x, y, x}); @@ -220,7 +219,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) { } XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); auto inner_tuple = builder.Tuple({x, y, x}); @@ -254,7 +253,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { // Verify setting the result layout of a computation with a tuple output. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {2, 2}), "y"); builder.Tuple({x, y}); @@ -291,7 +290,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { // Computation adds the respective array and vector elements from each tuple // argument and returns the results as a tuple. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, tuple_shape0, "x"); auto y = builder.Parameter(1, tuple_shape1, "y"); auto x_0 = builder.GetTupleElement(x, 0); @@ -338,7 +337,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { // Computation negates the array element and sums the two vector elements in // the nested tuple. The resulting array and vector are returned as a tuple. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, nested_tuple_shape, "param"); auto inner_tuple = builder.GetTupleElement(param, 0); auto inner_array = builder.GetTupleElement(inner_tuple, 0); @@ -376,7 +375,7 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { const Shape tuple_shape = ShapeUtil::MakeTupleShape({array_shape, array_shape}); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, tuple_shape, "param"); auto element_0 = builder.GetTupleElement(param, 0); auto element_1 = builder.GetTupleElement(param, 1); @@ -420,11 +419,11 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { std::vector element_shapes(kElementCount, element_shape); const Shape tuple_shape = ShapeUtil::MakeTupleShape(element_shapes); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, tuple_shape, "param"); // Add each element's tuple index value to every element. - std::vector result_elements; + std::vector result_elements; for (int i = 0; i < kElementCount; ++i) { auto element = builder.GetTupleElement(param, i); result_elements.push_back( @@ -453,9 +452,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { } } -// TODO(b/66968986): Test times out on CPU parallel backend. Disabled -// 2017-09-26. -XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) { +XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) { // Construct and run a computation which takes a two-level nested tuple // parameter with a large fanout. const int kFanout = 40; @@ -467,15 +464,15 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_CPU_PARALLEL(LargeNestedTuple)) { std::vector inner_tuple_shapes(kFanout, inner_tuple_shape); const Shape tuple_shape = ShapeUtil::MakeTupleShape(inner_tuple_shapes); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto param = builder.Parameter(0, tuple_shape, "param"); // The computation increments each leaf value by an amount equal to the leaf's // ordinal position in a traversal of the tuple. - std::vector result_elements; + std::vector result_elements; for (int i = 0; i < kFanout; ++i) { auto outer_element = builder.GetTupleElement(param, i); - std::vector inner_result_elements; + std::vector inner_result_elements; for (int j = 0; j < kFanout; ++j) { auto inner_element = builder.GetTupleElement(outer_element, j); inner_result_elements.push_back(builder.Add( @@ -522,7 +519,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { shape = ShapeUtil::MakeTupleShape({shape}); } - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto element = builder.Parameter(0, shape, "param"); for (int i = 0; i < kTupleDepth; ++i) { element = builder.GetTupleElement(element, 0); @@ -556,7 +553,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { // Test passing in an invalid number of arguments. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {3}), "y"); builder.Add(x, y); @@ -573,7 +570,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) { // Test passing in an argument with the wrong shape. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); builder.Neg(x); @@ -590,7 +587,7 @@ XLA_TEST_F(LocalClientExecuteTest, IncorrectArgumentShape) { XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) { // Test passing in an invalid result layout parameter. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "x"); builder.Neg(x); @@ -613,7 +610,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidResultLayout) { XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) { // Try to run a trivial computation on every device on the system. If a // specific device is not supported, check that the right error is returned. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto computation = builder.Build().ConsumeValueOrDie(); for (int d = 0; d < local_client_->device_count(); ++d) { @@ -640,7 +637,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnAllDeviceOrdinals) { XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) { // Try running computations on devices with device ordinal values which do not // exist. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto computation = builder.Build().ConsumeValueOrDie(); @@ -657,7 +654,7 @@ XLA_TEST_F(LocalClientExecuteTest, InvalidDeviceOrdinalValues) { XLA_TEST_F(LocalClientExecuteTest, RunOnStream) { // Run a computation on a specific stream on each device on the system. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto computation = builder.Build().ConsumeValueOrDie(); @@ -693,7 +690,7 @@ XLA_TEST_F(LocalClientExecuteTest, se::Stream wrong_stream(wrong_platform->ExecutorForDevice(0).ValueOrDie()); wrong_stream.Init(); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); auto execute_status = ExecuteLocally( builder.Build().ValueOrDie(), {}, DefaultExecutableBuildOptions(), @@ -710,7 +707,7 @@ XLA_TEST_F(LocalClientExecuteTest, .ValueOrDie(); TestAllocator allocator(wrong_platform); - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto y = builder.ConstantR0(123.0f); auto execute_status = ExecuteLocally( @@ -723,7 +720,7 @@ XLA_TEST_F(LocalClientExecuteTest, XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) { // Try to run a computation on a stream that has not been initialized. - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0f); LOG(INFO) << "default device = " << local_client_->default_device_ordinal(); @@ -743,7 +740,7 @@ XLA_TEST_F(LocalClientExecuteTest, RunOnUninitializedStream) { } XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); std::initializer_list vec1 = {1.f, 2.f, 3.f}; std::initializer_list vec2 = {2.f, 4.f, 6.f}; @@ -763,7 +760,7 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { } XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {3}), "x"); auto y = builder.ConstantR1({2.0f, 3.0f, 4.0f}); builder.Add(x, y); @@ -853,9 +850,8 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) { // TODO(b/34359662): Support infeed/outfeed on GPU and CPU parallel. // 2017-10-18. -XLA_TEST_F(LocalClientExecuteTest, - DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(InfeedOutfeedTest))) { - ComputationBuilder builder(local_client_, TestName()); +XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_GPU(InfeedOutfeedTest)) { + XlaBuilder builder(TestName()); const Shape shape = ShapeUtil::MakeShape(F32, {3}); auto in = builder.Infeed(shape); auto constant = builder.ConstantR1({1.0f, 2.0f, 3.0f}); @@ -893,7 +889,7 @@ void BM_LocalClientOverhead(int num_iters) { int device_ordinal = client->default_device_ordinal(); // Use a tiny add operation as the computation. - ComputationBuilder builder(client, "Add"); + XlaBuilder builder("Add"); auto shape = ShapeUtil::MakeShape(F32, {2, 3}); auto x = builder.Parameter(0, shape, "x"); builder.Add(x, x); diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index c60ba2422f48bf1ada89396fb344f5f337ab3413..ca8e4cdbdb6a8fdb2c70fbd39f77ffc18c995aa6 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -44,7 +44,8 @@ StatusOr TestAllocator::Allocate(int device_ordinal, allocation_count_++; device_allocation_count_[device_ordinal]++; } - return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size); + return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size, + retry_on_failure); } tensorflow::Status TestAllocator::Deallocate(int device_ordinal, @@ -156,7 +157,7 @@ ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const { } ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments) { return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions()) @@ -164,7 +165,7 @@ ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( } ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options) { @@ -173,14 +174,14 @@ ScopedShapedBuffer LocalClientTestBase::ExecuteLocallyOrDie( } StatusOr LocalClientTestBase::ExecuteLocally( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments) { return ExecuteLocally(computation, arguments, DefaultExecutableBuildOptions(), DefaultExecutableRunOptions()); } StatusOr LocalClientTestBase::ExecuteLocally( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options) { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index 4ee56a05ec6ec1ddf3fba0b357e2c3b586eb456e..3bbb760c806412a671bc2502846e123e2582fd16 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -21,8 +21,8 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" @@ -93,19 +93,19 @@ class LocalClientTestBase : public ::testing::Test { // Execute the given computation on the local client. With and without // options. StatusOr ExecuteLocally( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments); StatusOr ExecuteLocally( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options); ScopedShapedBuffer ExecuteLocallyOrDie( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments); ScopedShapedBuffer ExecuteLocallyOrDie( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, const ExecutableBuildOptions& build_options, const ExecutableRunOptions& run_options); diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc index 8fabcaca1b968dbd0546e16b97f48c43dbb60e3c..7df45bebebdd3eb2e71f27d831a8e2ac9e3b5f7c 100644 --- a/tensorflow/compiler/xla/tests/map_test.cc +++ b/tensorflow/compiler/xla/tests/map_test.cc @@ -16,8 +16,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -341,48 +339,6 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) { ComputeAndCompareR0(&builder, 73.0, {}, ErrorSpec(0.01f)); } -TEST_F(MapTest, VersionedEmbeddedComputation) { - // Build a computation X, use it in a map, then add an additional operation to - // computation X and use it again in a different map. Verify that the proper - // versions of computation X are used in each of the maps. - - // Create a (embedded) computation which adds one to its parameter argument. - ComputationBuilder embedded_builder(client_, "EmbeddedComputation"); - auto param_0 = - embedded_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0"); - auto constant_one = embedded_builder.ConstantR0(1.0); - auto adder_to_one = embedded_builder.Add(param_0, constant_one); - auto computation_status = embedded_builder.Build(); - ASSERT_IS_OK(computation_status.status()); - auto embedded_computation = computation_status.ConsumeValueOrDie(); - - ComputationBuilder builder(client_, TestName()); - auto constant_vector = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); - auto map_plus_1 = builder.Map({constant_vector}, embedded_computation, {0}); - - // Add another Add(1) operation to the existing embedded computation. This - // requires using the stub interface because the ComputationBuilder does not - // allow modification to the XlaComputation objects after they have been - // built. - BinaryOpRequest request; - request.set_binop(BINOP_ADD); - *request.mutable_lhs() = adder_to_one; - *request.mutable_rhs() = constant_one; - OpRequest op_request; - *op_request.mutable_computation() = embedded_computation.handle(); - *op_request.mutable_binary_op_request() = request; - OpResponse response; - tensorflow::Status s = client_->stub()->Op(&op_request, &response); - ASSERT_TRUE(s.ok()); - - auto map_plus_2 = builder.Map({map_plus_1}, embedded_computation, {0}); - - // The original vector has Add(1) applied to it with a map, followed by - // Add(1+1) resulting in a net Add(3). - ComputeAndCompareR1(&builder, {4.0, 5.0, 6.0, 7.0}, {}, - ErrorSpec(0.01f)); -} - TEST_F(MapTest, MapBinaryAdder) { // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors. XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc index c42f71388baba73e08a361d817e41b03e03bf133..7fa61eb33c2930ac8192ac965a71122001f808d3 100644 --- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc @@ -19,8 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -60,7 +61,7 @@ TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, TypesF16F32); XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { using T = TypeParam; - ComputationBuilder builder(this->client_, "exp_2x2"); + XlaBuilder builder("exp_2x2"); auto data = builder.ConstantR2FromArray2D({ {1.0f, 0.0f}, // row 0 {-1.0f, 0.5f}, // row 1 @@ -77,10 +78,10 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { using T = TypeParam; - Computation add_half; + XlaComputation add_half; { // add_half(x) = x + 0.5 - ComputationBuilder builder(this->client_, "add_half"); + XlaBuilder builder("add_half"); auto x_value = builder.Parameter(0, ShapeUtil::MakeShapeWithType({}), "x_value"); auto half = builder.ConstantR0(static_cast(0.5)); @@ -90,7 +91,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { add_half = computation_status.ConsumeValueOrDie(); } - ComputationBuilder builder(this->client_, "map_2x2"); + XlaBuilder builder("map_2x2"); auto data = builder.ConstantR2FromArray2D({ {1.0f, 0.0f}, // row 0 {-1.0f, 0.5f}, // row 1 @@ -106,7 +107,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) { using T = TypeParam; - ComputationBuilder builder(this->client_, "max_2x2"); + XlaBuilder builder("max_2x2"); auto lhs = builder.ConstantR2FromArray2D({ {7.0f, 2.0f}, // row 0 {3.0f, -4.0f}, // row 1 @@ -143,8 +144,7 @@ class TestLinspaceMaxParametric MakeLinspaceArray2D(from, to, rows, cols); auto arhs = MakeUnique>(rows, cols, static_cast(1.0f)); - ComputationBuilder builder( - client_, + XlaBuilder builder( tensorflow::strings::Printf("max_%lldx%lld_linspace", rows, cols)); auto lhs = builder.ConstantR2FromArray2D(*alhs); auto rhs = builder.ConstantR2FromArray2D(*arhs); @@ -219,7 +219,7 @@ class MatOpsDotAddTest client_->TransferToServer(*Literal::CreateR2FromArray2DWithLayout( rhs, LayoutUtil::MakeLayout(minor_to_major(row_major))))); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs_arg = builder.Parameter(0, lhs_shape, "lhs"); auto lhs_mat_arg = lhs_arg; if (transpose) { diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc index 11c0bf7a5a5bde9edcfb7f76a5c10ac4dd77bcee..0791a71aacf7614286fe964623a3172a174d4722 100644 --- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc +++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc @@ -19,8 +19,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -32,7 +32,7 @@ namespace { class SliceTest : public ClientLibraryTestBase {}; XLA_TEST_F(SliceTest, Slice2D) { - ComputationBuilder builder(client_, "slice_2d"); + XlaBuilder builder("slice_2d"); auto original = builder.ConstantR2( {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}); builder.Slice(original, {2, 1}, {4, 3}, {1, 1}); @@ -42,7 +42,7 @@ XLA_TEST_F(SliceTest, Slice2D) { } XLA_TEST_F(SliceTest, Slice3D) { - ComputationBuilder builder(client_, "slice_3d"); + XlaBuilder builder("slice_3d"); Array3D array_3d( {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}); auto original = builder.ConstantR3FromArray3D(array_3d); diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc index bb7e800df84121f2045141bc366c34b94ba694ea..97dab860c06bddb2a0ffd45e48c4912c5f55d574 100644 --- a/tensorflow/compiler/xla/tests/params_test.cc +++ b/tensorflow/compiler/xla/tests/params_test.cc @@ -20,9 +20,10 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -41,7 +42,7 @@ namespace { class ParamsTest : public ClientLibraryTestBase {}; XLA_TEST_F(ParamsTest, ConstantR0F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR0(3.14159f); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); @@ -53,7 +54,7 @@ XLA_TEST_F(ParamsTest, ConstantR0F32Param) { } XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({}); std::unique_ptr param0_data = client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); @@ -65,7 +66,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S0F32Param) { } XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({3.14f, -100.25f}); std::unique_ptr param0_data = @@ -78,7 +79,7 @@ XLA_TEST_F(ParamsTest, ConstantR1S2F32Param) { } XLA_TEST_F(ParamsTest, ConstantR1U8Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); string str("hello world"); std::unique_ptr param0_literal = Literal::CreateR1U8(str); std::unique_ptr param0_data = @@ -91,7 +92,7 @@ XLA_TEST_F(ParamsTest, ConstantR1U8Param) { } XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR2FromArray2D(Array2D(3, 0)); std::unique_ptr param0_data = @@ -104,7 +105,7 @@ XLA_TEST_F(ParamsTest, ConstantR2_3x0_F32Param) { } XLA_TEST_F(ParamsTest, ConstantR2F32Param) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR2( {{3.14f, -100.25f}, {7e8f, 7e-9f}, {30.3f, -100.0f}}); std::unique_ptr param0_data = @@ -119,7 +120,7 @@ XLA_TEST_F(ParamsTest, ConstantR2F32Param) { } XLA_TEST_F(ParamsTest, TwoParameters) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr literal0 = Literal::CreateR1({1, 2}); std::unique_ptr param0_data = @@ -156,19 +157,15 @@ XLA_TEST_F(ParamsTest, MissingParameter) { std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2"); - auto computation = builder.Build().ConsumeValueOrDie(); + auto computation_status = builder.Build(); - auto execute_status = client_->Execute(computation, {data.get(), data.get()}, - /*execution_options=*/nullptr, - /*execution_profile=*/nullptr); - ASSERT_EQ(execute_status.status().code(), - tensorflow::error::FAILED_PRECONDITION); + ASSERT_NE(computation_status.status(), tensorflow::Status::OK()); } XLA_TEST_F(ParamsTest, UnusedParameter) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr literal0 = Literal::CreateR1({1, 2}); std::unique_ptr param0_data = @@ -188,7 +185,7 @@ XLA_TEST_F(ParamsTest, UnusedParameter) { XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) { // Build a computation with a couple unused parameters which are used in an // unused expression. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr literal0 = Literal::CreateR1({1, 2}); std::unique_ptr param0_data = @@ -214,12 +211,12 @@ XLA_TEST_F(ParamsTest, UnusedParametersInUnusedExpression) { } XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); constexpr int size = 8 * 128 * 2; std::vector init_value = {{0, 1}}; init_value.resize(size); - ComputationDataHandle sum_handle = builder.ConstantR1(init_value); + XlaOp sum_handle = builder.ConstantR1(init_value); std::vector sum = {{0, 1}}; sum.resize(size); @@ -237,8 +234,7 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { std::unique_ptr literal = Literal::CreateR1(sum_value); param_data_owner.push_back( client_->TransferToServer(*literal).ConsumeValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); sum_handle = builder.Add(sum_handle, param); } @@ -262,10 +258,10 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) { // compilation. XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(ThreeThousandParameters))) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector> param_data_owner; - ComputationDataHandle sum_handle = builder.ConstantR0(0.0f); + XlaOp sum_handle = builder.ConstantR0(0.0f); float target = 0.0; constexpr int kParamCount = 3000; for (int i = 0; i < kParamCount; ++i) { @@ -273,8 +269,7 @@ XLA_TEST_F(ParamsTest, std::unique_ptr literal = Literal::CreateR0(i); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); sum_handle = builder.Add(sum_handle, param); } @@ -294,25 +289,24 @@ XLA_TEST_F(ParamsTest, // compilation. XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU( ThreeThousandParametersAndOutputElements))) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector> param_data_owner; - ComputationDataHandle sum_handle = builder.ConstantR1({0, 0}); + XlaOp sum_handle = builder.ConstantR1({0, 0}); int32 target = 0; constexpr int kParamCount = 3000; - std::vector params; + std::vector params; for (int i = 0; i < kParamCount; ++i) { target += i; std::unique_ptr literal = Literal::CreateR1({i, i}); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); params.push_back(param); sum_handle = builder.Add(sum_handle, param); } - std::vector outputs; + std::vector outputs; for (int i = 0; i < kParamCount; ++i) { outputs.push_back(builder.Add(params[i], sum_handle)); } @@ -353,18 +347,17 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU( // 2017-12-12. XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector> param_data_owner; constexpr int kParamCount = 1900; - std::vector params; + std::vector params; std::vector parameter_shapes; for (int i = 0; i < kParamCount; ++i) { std::unique_ptr literal = Literal::CreateR1({i, i}); param_data_owner.push_back( std::move(client_->TransferToServer(*literal)).ValueOrDie()); - ComputationDataHandle param = - builder.Parameter(i, literal->shape(), "param"); + XlaOp param = builder.Parameter(i, literal->shape(), "param"); params.push_back(param); parameter_shapes.push_back(literal->shape()); } @@ -374,7 +367,7 @@ XLA_TEST_F(ParamsTest, std::unique_ptr bool_literal = Literal::CreateR0(false); param_data_owner.push_back( std::move(client_->TransferToServer(*bool_literal)).ValueOrDie()); - ComputationDataHandle bool_param = + XlaOp bool_param = builder.Parameter(kParamCount, bool_literal->shape(), "bool_param"); params.push_back(bool_param); parameter_shapes.push_back(bool_literal->shape()); @@ -383,9 +376,9 @@ XLA_TEST_F(ParamsTest, // Create a computation for the condition: while(bool_param). Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes); - Computation condition; + XlaComputation condition; { - ComputationBuilder builder(client_, "condition"); + XlaBuilder builder("condition"); auto condition_parameter = builder.Parameter(0, while_shape, "condition_parameter"); builder.GetTupleElement(condition_parameter, kParamCount); @@ -394,11 +387,11 @@ XLA_TEST_F(ParamsTest, // Create a computation for the body. // Add {1, 1} to the each tuple element. - Computation body; + XlaComputation body; { - ComputationBuilder builder(client_, "body"); + XlaBuilder builder("body"); auto body_parameter = builder.Parameter(0, while_shape, "body_parameter"); - std::vector updates; + std::vector updates; for (int i = 0; i < kParamCount; ++i) { auto add = builder.Add(builder.GetTupleElement(body_parameter, i), builder.ConstantR1({1, 1})); @@ -413,7 +406,7 @@ XLA_TEST_F(ParamsTest, auto loop = builder.While(condition, body, init); - std::vector outputs; + std::vector outputs; for (int i = 0; i < kParamCount; ++i) { outputs.push_back(builder.GetTupleElement(loop, i)); } @@ -437,7 +430,7 @@ XLA_TEST_F(ParamsTest, #endif XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Shape r1f32_3 = ShapeUtil::MakeShape(F32, {3}); Shape tuple_shape = ShapeUtil::MakeTupleShape({r1f32_3, r1f32_3}); @@ -464,7 +457,7 @@ XLA_TEST_F(ParamsTest, TupleOfR1ParametersAddedTogether) { XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) { std::unique_ptr literal = Literal::CreateR2WithLayout( {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({0, 1})); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Parameter(0, literal->shape(), "input"); std::unique_ptr data = @@ -476,7 +469,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_Layout_01) { XLA_TEST_F(ParamsTest, R2_2x2_Layout_10) { std::unique_ptr literal = Literal::CreateR2WithLayout( {{1, 3}, {2, 4}}, LayoutUtil::MakeLayout({1, 0})); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Parameter(0, literal->shape(), "input"); std::unique_ptr data = @@ -501,7 +494,7 @@ XLA_TEST_F(ParamsTest, R2_2x2_TryToPassReverseLayoutToParameter) { ASSERT_EQ(2, literal->Get({0, 1})); } // Use the original shape in building the computation. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.Parameter(0, original, "input"); // Use the slice operator to get an off-diagonal element. builder.Slice(input, {0, 1}, {1, 2}, {1, 1}); diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc index 10e44b274a8a9f3ac28dc40d7b1938d24a9ee40c..77159efb26f3b7dd4918f24305f7269a2d6ff647 100644 --- a/tensorflow/compiler/xla/tests/pred_test.cc +++ b/tensorflow/compiler/xla/tests/pred_test.cc @@ -17,9 +17,9 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -29,63 +29,62 @@ namespace { class PredTest : public ClientLibraryTestBase { protected: - void TestCompare(bool lhs, bool rhs, bool expected, - ComputationDataHandle (ComputationBuilder::*op)( - const ComputationDataHandle&, - const ComputationDataHandle&, - tensorflow::gtl::ArraySlice)) { - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle lhs_op = builder.ConstantR0(lhs); - ComputationDataHandle rhs_op = builder.ConstantR0(rhs); - ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {}); + void TestCompare( + bool lhs, bool rhs, bool expected, + XlaOp (XlaBuilder::*op)(const xla::XlaOp&, const xla::XlaOp&, + tensorflow::gtl::ArraySlice)) { + XlaBuilder builder(TestName()); + XlaOp lhs_op = builder.ConstantR0(lhs); + XlaOp rhs_op = builder.ConstantR0(rhs); + XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } }; TEST_F(PredTest, ConstantR0PredTrue) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR0(true); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, ConstantR0PredFalse) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR0(false); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, ConstantR0PredCompareEq) { - TestCompare(true, false, false, &ComputationBuilder::Eq); + TestCompare(true, false, false, &XlaBuilder::Eq); } TEST_F(PredTest, ConstantR0PredCompareNe) { - TestCompare(true, false, true, &ComputationBuilder::Ne); + TestCompare(true, false, true, &XlaBuilder::Ne); } TEST_F(PredTest, ConstantR0PredCompareLe) { - TestCompare(true, false, false, &ComputationBuilder::Le); + TestCompare(true, false, false, &XlaBuilder::Le); } TEST_F(PredTest, ConstantR0PredCompareLt) { - TestCompare(true, false, false, &ComputationBuilder::Lt); + TestCompare(true, false, false, &XlaBuilder::Lt); } TEST_F(PredTest, ConstantR0PredCompareGe) { - TestCompare(true, false, true, &ComputationBuilder::Ge); + TestCompare(true, false, true, &XlaBuilder::Ge); } TEST_F(PredTest, ConstantR0PredCompareGt) { - TestCompare(true, false, true, &ComputationBuilder::Gt); + TestCompare(true, false, true, &XlaBuilder::Gt); } TEST_F(PredTest, ConstantR1Pred) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({true, false, false, true}); ComputeAndCompareR1(&builder, {true, false, false, true}, {}); } TEST_F(PredTest, ConstantR2Pred) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({{false, true, true}, {true, false, false}}); const string expected = R"(pred[2,3] { @@ -96,28 +95,28 @@ TEST_F(PredTest, ConstantR2Pred) { } TEST_F(PredTest, AnyR1True) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({true, false}); TF_ASSERT_OK(Any(a, &builder).status()); ComputeAndCompareR0(&builder, true, {}); } TEST_F(PredTest, AnyR1False) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({false, false}); TF_ASSERT_OK(Any(a, &builder).status()); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, AnyR1VacuouslyFalse) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({}); TF_ASSERT_OK(Any(a, &builder).status()); ComputeAndCompareR0(&builder, false, {}); } TEST_F(PredTest, AnyR2True) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({ {false, false, false}, {false, false, false}, @@ -128,7 +127,7 @@ TEST_F(PredTest, AnyR2True) { } TEST_F(PredTest, AnyR2False) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({ {false, false, false}, {false, false, false}, diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 6aafb9fa6cb2175c478f0e9a5e16f5808cbea590..29a4f75001c688f2215745ab913df68bf2f62b76 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -16,8 +16,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -52,13 +52,14 @@ class PrngTest : public ClientLibraryTestBase { template std::unique_ptr PrngTest::UniformTest( T a, T b, tensorflow::gtl::ArraySlice dims, int64 seed) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngUniform( builder.ConstantR0(a), builder.ConstantR0(b), ShapeUtil::MakeShape(primitive_util::NativeToPrimitiveType(), dims)); SetSeed(seed); - auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + auto actual = + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); EXPECT_THAT(dims, ::testing::ElementsAreArray(actual->shape().dimensions())); actual->EachCell([=](tensorflow::gtl::ArraySlice, T value) { EXPECT_LE(a, value); @@ -81,8 +82,7 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest(0, 1, {0x100, 0x100}); } XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest(5, 24, {12}); } // TODO(b/71543667): Fix Rng ops on LLVM backends. -XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL( - DISABLED_ON_CPU(ScalarBF16Tests)))) { +XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) { for (int64 seed = 0; seed < 100; ++seed) { // The largest negative number smaller than zero in bf16 that's not // denormalized. @@ -105,8 +105,7 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL( } // TODO(b/71543667): Fix Rng ops on LLVM backends. -XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU( - DISABLED_ON_CPU_PARALLEL(ScalarBF16CountTests)))) { +XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) { // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75, // they should get similar counts. bfloat16 low = static_cast(32.25); @@ -141,13 +140,14 @@ double PrngTest::UniformChiSquared(int32 range_size, int32 expected_count, int64 seed) { int32 sample_size = range_size * expected_count; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngUniform(builder.ConstantR0(0), builder.ConstantR0(range_size), ShapeUtil::MakeShape(S32, {sample_size})); SetSeed(seed); - auto actual = ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + auto actual = + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); std::vector counts(range_size, 0); actual->EachCell([&counts](tensorflow::gtl::ArraySlice, int32 value) { ++counts[value]; }); @@ -182,16 +182,15 @@ XLA_TEST_F(PrngTest, Uniformity256) { XLA_TEST_F(PrngTest, MapUsingRng) { // Build a x -> (x + U[0,1)) computation. - auto build_sum_rng = [this](ComputationBuilder& builder) { + auto build_sum_rng = [this](XlaBuilder& builder) { auto b = builder.CreateSubBuilder("sum_with_rng"); auto x = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "input"); - b->Add(x, - b->RngUniform(b->ConstantR0(0), b->ConstantR0(1), - ShapeUtil::MakeShape(F32, {}))); + b->Add(x, b->RngUniform(b->ConstantR0(0), b->ConstantR0(1), + ShapeUtil::MakeShape(F32, {}))); return b->BuildAndNoteError(); }; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({2.2f, 5.3f, 4.4f, 5.5f}); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr param0_data, @@ -226,7 +225,7 @@ XLA_TEST_F(PrngTest, MapUsingRng) { XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { // Build a U[0,1) computation. auto build_computation = [this]() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngUniform(builder.ConstantR0(0), builder.ConstantR0(1), ShapeUtil::MakeShape(F32, {10})); @@ -282,24 +281,24 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { } XLA_TEST_F(PrngTest, TenValuesN01) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.RngNormal(builder.ConstantR0(0), builder.ConstantR0(1), ShapeUtil::MakeShape(F32, {10})); SetSeed(42); - ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); // TODO(b/25995601): Test that resultant values are reasonable } XLA_TEST_F(PrngTest, RngUniformCrash) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // This used to crash XLA during LLVM IR generation for CPUs. auto rng_uniform = builder.RngUniform(builder.ConstantR0(0), builder.ConstantR0(1000 * 1000), ShapeUtil::MakeShape(S32, {})); SetSeed(0); - ExecuteAndTransferOrDie(&builder, /*arguments=*/{}); + ExecuteAndTransfer(&builder, /*arguments=*/{}).ConsumeValueOrDie(); } } // namespace diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc index 212512207cfdc4d2ebdc4e7fd8f5794852cc6a79..f95e75648343aa88bd7c39de4ee9f387f2b60506 100644 --- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc +++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -30,13 +30,13 @@ namespace { class QueryInferredShapeTest : public ClientLibraryTestBase {}; TEST_F(QueryInferredShapeTest, OnePlusOneShape) { - ComputationBuilder builder(client_, "one_plus_one"); + XlaBuilder builder("one_plus_one"); auto one = builder.ConstantR0(1.0); auto result = builder.Add(one, one); - StatusOr> shape_status = builder.GetShape(result); + StatusOr shape_status = builder.GetShape(result); ASSERT_IS_OK(shape_status.status()); auto shape = shape_status.ConsumeValueOrDie(); - ASSERT_TRUE(ShapeUtil::Equal(*shape, ShapeUtil::MakeShape(F32, {}))); + ASSERT_TRUE(ShapeUtil::Equal(shape, ShapeUtil::MakeShape(F32, {}))); } } // namespace diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index 423ccadb5b3b7df950824349737a833c08870d77..bcc05c2d41d8439b021cdf6533b5ca87c19aec1f 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -35,7 +35,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -60,10 +59,9 @@ limitations under the License. namespace xla { namespace { -using FuncGeneratorForType = Computation (*)(PrimitiveType, - ComputationBuilder*); +using FuncGeneratorForType = XlaComputation (*)(PrimitiveType, XlaBuilder*); -using FuncGenerator = Computation (*)(ComputationBuilder*); +using FuncGenerator = XlaComputation (*)(XlaBuilder*); class ReduceTest : public ClientLibraryTestBase { protected: @@ -89,8 +87,8 @@ class ReduceTest : public ClientLibraryTestBase { // Runs an R1 => R0 reduction test with the given number of elements. void RunR1ToR0Test(int64 element_count) { - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {element_count}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -119,13 +117,13 @@ class ReduceTest : public ClientLibraryTestBase { void RunR1ToR0PredTest(bool and_reduce, tensorflow::gtl::ArraySlice input_data) { const int element_count = input_data.size(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Shape input_shape = ShapeUtil::MakeShape(S32, {element_count}); auto input_par = builder.Parameter(0, input_shape, "input"); auto pred_values = builder.Eq(input_par, builder.ConstantR1(element_count, 1)); - ComputationDataHandle init_value; - Computation reduce; + XlaOp init_value; + XlaComputation reduce; if (and_reduce) { init_value = builder.ConstantR0(true); reduce = CreateScalarAndComputation(&builder); @@ -157,13 +155,13 @@ class ReduceTest : public ClientLibraryTestBase { template void RunR2ToR1PredTest(bool and_reduce, int64 rows, int64 minor = 1, int64 major = 0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const Shape input_shape = ShapeUtil::MakeShape(U8, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto input_pred = builder.Eq(input, builder.ConstantR0(1)); - ComputationDataHandle init_value; - Computation reduce_op; + XlaOp init_value; + XlaComputation reduce_op; if (and_reduce) { init_value = builder.ConstantR0(true); reduce_op = CreateScalarAndComputation(&builder); @@ -202,8 +200,8 @@ class ReduceTest : public ClientLibraryTestBase { // Runs an R2 => R0 reduction test with the given number of (rows, cols). void RunR2ToR0Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) { - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -230,8 +228,8 @@ class ReduceTest : public ClientLibraryTestBase { // Runs an R2 => R1 reduction test with the given number of (rows, cols). void RunR2ToR1Test(int64 rows, int64 cols, int64 minor = 1, int64 major = 0) { - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -261,7 +259,7 @@ class ReduceTest : public ClientLibraryTestBase { template void ComputeAndCompareGeneric( typename std::enable_if::value, - ComputationBuilder>::type* builder, + XlaBuilder>::type* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments) { ComputeAndCompareR1(builder, expected, arguments, @@ -271,7 +269,7 @@ class ReduceTest : public ClientLibraryTestBase { template void ComputeAndCompareGeneric( typename std::enable_if::value, - ComputationBuilder>::type* builder, + XlaBuilder>::type* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments) { ComputeAndCompareR1(builder, expected, arguments); @@ -279,15 +277,15 @@ class ReduceTest : public ClientLibraryTestBase { template void RunVectorizedReduceTestForType( - const std::function& + const std::function& reduction_function_generator, const std::function& reference_reduction_function, const NativeT& initial_value) { const int rows = 64, cols = 128; const int minor = 1, major = 0; - ComputationBuilder builder(client_, TestName()); - Computation reduction_function = reduction_function_generator(&builder); + XlaBuilder builder(TestName()); + XlaComputation reduction_function = reduction_function_generator(&builder); const Shape input_shape = ShapeUtil::MakeShape( xla::primitive_util::NativeToPrimitiveType(), {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); @@ -322,7 +320,7 @@ class ReduceTest : public ClientLibraryTestBase { } void RunVectorizedReduceTest( - const std::function& + const std::function& reduction_function_generator_for_type, const std::function& reference_reduction_function_for_floats, @@ -334,21 +332,21 @@ class ReduceTest : public ClientLibraryTestBase { uint32 unsigned_int_identity) { // Float version RunVectorizedReduceTestForType( - [&](ComputationBuilder* builder) { + [&](XlaBuilder* builder) { return reduction_function_generator_for_type(F32, builder); }, reference_reduction_function_for_floats, floating_point_identity); // Signed int version RunVectorizedReduceTestForType( - [&](ComputationBuilder* builder) { + [&](XlaBuilder* builder) { return reduction_function_generator_for_type(S32, builder); }, reference_reduction_function_for_ints, signed_int_identity); // Unsigned int version RunVectorizedReduceTestForType( - [&](ComputationBuilder* builder) { + [&](XlaBuilder* builder) { return reduction_function_generator_for_type(U32, builder); }, reference_reduction_function_for_uints, unsigned_int_identity); @@ -442,8 +440,8 @@ XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) { XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) { const int64 rows = 111, cols = 50; - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -473,8 +471,8 @@ XLA_TEST_F(ReduceTest, ReduceElementwiseR2_111x50_To_R1) { XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) { const int64 rows = 111, cols = 50; - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, cols}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -522,8 +520,8 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) { XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) { const int64 rows = 111, cols = 50; - ComputationBuilder builder(client_, TestName()); - Computation add_f32 = CreateScalarAddComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder); const Shape input_shape = ShapeUtil::MakeShape(F32, {rows, 2, cols / 2}); auto input = builder.Parameter(0, input_shape, "input"); auto zero = builder.ConstantR0(0.0); @@ -569,7 +567,7 @@ void PrintTo(const BoundsLayout& spec, std::ostream* os) { // Add-reduces a broadcasted scalar matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto add = CreateScalarAddComputation(F32, &builder); auto scalar = builder.ConstantR0(42.0); auto broadcasted = builder.Broadcast(scalar, {500, 500}); @@ -581,7 +579,7 @@ XLA_TEST_F(ReduceTest, AddReduce2DScalarToR0) { // Max-reduces a broadcasted scalar matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto max = CreateScalarMaxComputation(F32, &builder); auto scalar = builder.ConstantR0(42.0); auto broadcasted = builder.Broadcast(scalar, {500, 500}); @@ -593,7 +591,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DScalarToR0) { // Max-reduces a matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, MaxReduce2DToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto max = CreateScalarMaxComputation(F32, &builder); Array2D input(300, 250); input.FillRandom(214.0f); @@ -608,7 +606,7 @@ XLA_TEST_F(ReduceTest, MaxReduce2DToR0) { // Min-reduces matrix among dimension 1 and 0. XLA_TEST_F(ReduceTest, MinReduce2DToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto min = CreateScalarMinComputation(F32, &builder); Array2D input(150, 130); input.FillRandom(214.0f); @@ -623,7 +621,7 @@ XLA_TEST_F(ReduceTest, MinReduce2DToR0) { } XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array2D input({{1}, {2}}); auto min = CreateScalarMinComputation(U32, &builder); auto input_literal = Literal::CreateR2FromArray2D(input); @@ -636,7 +634,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MinReduce) { } XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array2D input({{1}, {2}}); auto max = CreateScalarMaxComputation(U32, &builder); auto input_literal = Literal::CreateR2FromArray2D(input); @@ -650,7 +648,7 @@ XLA_TEST_F(ReduceTest, UnsignedInt_MaxReduce) { // Reduces a matrix among dimension 1. XLA_TEST_F(ReduceTest, Reduce2DAmong1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {1}); @@ -661,7 +659,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong1) { XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) { // Reduce a matrix among dimensions 0 and 1 (sum it up to a scalar). - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1}); @@ -671,7 +669,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmong0and1) { // Tests 2D matrix ReduceToRow operation. XLA_TEST_F(ReduceTest, Reduce2DAmongY) { - ComputationBuilder builder(client_, "reduce_among_y"); + XlaBuilder builder("reduce_among_y"); auto m = builder.ConstantLiteral(*literal_2d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0}); @@ -681,7 +679,7 @@ XLA_TEST_F(ReduceTest, Reduce2DAmongY) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {1, 2}); @@ -691,7 +689,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_1_2) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1}); @@ -701,7 +699,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDims_0_1) { } XLA_TEST_F(ReduceTest, ReduceR3ToR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0, 1, 2}); @@ -711,7 +709,7 @@ XLA_TEST_F(ReduceTest, ReduceR3ToR0) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {0}); @@ -726,7 +724,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim0) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {1}); @@ -743,7 +741,7 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim1) { } XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto m = builder.ConstantLiteral(*literal_3d_); auto add = CreateScalarAddComputation(F32, &builder); builder.Reduce(m, builder.ConstantR0(0.0f), add, {2}); @@ -817,7 +815,7 @@ class ReduceR3ToR2Test : public ReduceTest, public ::testing::WithParamInterface {}; XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const auto& bounds = GetParam().bounds; Array3D input_array(bounds[0], bounds[1], bounds[2]); // input_array.FillRandom(3.14f, 0.05); @@ -831,7 +829,7 @@ XLA_TEST_P(ReduceR3ToR2Test, ReduceR3ToR2) { auto input_activations = builder.Parameter(0, input_literal->shape(), "input"); - Computation add = CreateScalarAddComputation(F32, &builder); + XlaComputation add = CreateScalarAddComputation(F32, &builder); auto sum = builder.Reduce(input_activations, builder.ConstantR0(0.0f), add, GetParam().reduce_dims); @@ -871,8 +869,8 @@ INSTANTIATE_TEST_CASE_P( // IrEmitterUnnested::EmitInitializer() for the Reduce operator. Failed on // 2017-07-26. XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OperationOnConstantAsInitValue)) { - ComputationBuilder builder(client_, TestName()); - Computation max_f32 = CreateScalarMaxComputation(F32, &builder); + XlaBuilder builder(TestName()); + XlaComputation max_f32 = CreateScalarMaxComputation(F32, &builder); auto a = builder.ConstantR0(2.0f); auto a2 = builder.Abs(a); @@ -899,8 +897,8 @@ class ReduceInitializerTest : public ReduceTest { protected: template void DoTest(T initializer, int num_elems) { - ComputationBuilder builder(client_, TestName()); - Computation max_fn = CreateScalarMaxComputation( + XlaBuilder builder(TestName()); + XlaComputation max_fn = CreateScalarMaxComputation( primitive_util::NativeToPrimitiveType(), &builder); auto init = builder.ConstantR0(initializer); @@ -940,7 +938,7 @@ XLA_TEST_F(ReduceInitializerTest, U64InitializerBigValue) { // returns one of the parameters). In this case, we return the rhs, which for // a 1D array with one element, should not be the init value. XLA_TEST_F(ReduceTest, ReduceIdentity) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Shape single_float = ShapeUtil::MakeShape(F32, {}); builder.Parameter(0, single_float, "lhs-unused"); builder.Parameter(1, single_float, "rhs-used"); diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 0a097667222d8c315cb9943688d6dbcb4426a8b3..10a3da3a387641ec45baf02d15790e32371601fa 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -861,8 +861,7 @@ INSTANTIATE_TEST_CASE_P( class R4ReduceWindowAnyDimsTest : public R4ReduceWindowTest {}; // TODO(b/72234705): Fix the test cases failed on CPU and GPU. -XLA_TEST_P(R4ReduceWindowAnyDimsTest, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) { +XLA_TEST_P(R4ReduceWindowAnyDimsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) { DoIt(); } @@ -1151,7 +1150,7 @@ class R2ReduceWindowFailingCpuGpuBf16Test : public R2ReduceWindowTest {}; // TODO(b/72234705): Fix the test cases failed on CPU and GPU. XLA_TEST_P(R2ReduceWindowFailingCpuGpuBf16Test, - DISABLED_ON_CPU_PARALLEL(DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt)))) { + DISABLED_ON_CPU(DISABLED_ON_GPU(DoIt))) { DoIt(); } diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc index 6d063ffc363c092a1fbc40cbc22e87181d0c2502..36d763b0f7f4267ede076c0b25cfaf9654e96e0d 100644 --- a/tensorflow/compiler/xla/tests/replay_test.cc +++ b/tensorflow/compiler/xla/tests/replay_test.cc @@ -15,13 +15,13 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" @@ -38,17 +38,17 @@ class ReplayTest : public ClientLibraryTestBase {}; TEST_F(ReplayTest, TwoPlusTwoReplay) { // Make 2+2 computation. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto two = builder.ConstantR0(2); builder.Add(two, two); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); // Serialize it out. - std::unique_ptr module = + std::unique_ptr module = computation.Snapshot().ConsumeValueOrDie(); // Replay it. - Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); + XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); // Check signature is the same. std::unique_ptr original_shape = @@ -69,18 +69,18 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) { XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { // Make computation. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(S32, {}), "y"); builder.Add(x, y); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); // Serialize it out. - std::unique_ptr module = + std::unique_ptr module = computation.Snapshot().ConsumeValueOrDie(); // Replay it. - Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); + XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); // Check signature is the same. std::unique_ptr original_shape = @@ -109,24 +109,24 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) { TEST_F(ReplayTest, MapPlusTwoOverR1) { // As above, but with map(+2) over some constant array. - ComputationBuilder plus_two_builder(client_, "plus two"); + XlaBuilder plus_two_builder("plus two"); auto input = plus_two_builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "input"); plus_two_builder.Add(input, plus_two_builder.ConstantR0(2)); - Computation plus_two = plus_two_builder.Build().ConsumeValueOrDie(); + XlaComputation plus_two = plus_two_builder.Build().ConsumeValueOrDie(); - ComputationBuilder mapper_builder(client_, TestName()); + XlaBuilder mapper_builder(TestName()); auto original = mapper_builder.ConstantR1({1, 2, 3}); mapper_builder.Map({original}, plus_two, {0}); - Computation computation = mapper_builder.Build().ConsumeValueOrDie(); + XlaComputation computation = mapper_builder.Build().ConsumeValueOrDie(); // Serialize it out. - std::unique_ptr module = + std::unique_ptr module = computation.Snapshot().ConsumeValueOrDie(); // Replay it. - Computation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); + XlaComputation replayed = client_->LoadSnapshot(*module).ConsumeValueOrDie(); // Check signature is the same. std::unique_ptr original_shape = @@ -135,10 +135,6 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) { client_->GetComputationShape(replayed).ConsumeValueOrDie(); ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape)); - // Destroy the originals. - computation.Reset(); - plus_two.Reset(); - // Run it. std::unique_ptr literal = client_ diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc index e045e164e2e2db7d3480e7c2d1e20f461820ae67..5ebd5268992846e80dcce2675f8e92038e190ecf 100644 --- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc @@ -20,10 +20,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" @@ -45,7 +44,7 @@ namespace { using ReshapeMotionTest = ClientLibraryTestBase; TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR2({{2, 3, 5}, {7, 11, 13}}); auto b = builder.ConstantR2({{17, 19}, {23, 29}, {31, 37}}); auto c = builder.Reshape(a, {6}); diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc index 6959c95502cb7af6b720592e7836c6789719a528..e7bd142dc9ddefbd8bebfb77d72218d662645c31 100644 --- a/tensorflow/compiler/xla/tests/reverse_test.cc +++ b/tensorflow/compiler/xla/tests/reverse_test.cc @@ -114,7 +114,7 @@ class ReverseTest : public ClientLibraryTestBase {}; // Tests the reverse operation on a 4D U8 array on dimension 0 and 3. XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); // Input shape is U8[1x2x3x4]. // clang-format off Array4D input({{ @@ -144,7 +144,7 @@ XLA_TEST_F(ReverseTest, Reverse4DU8ArrayOnDim23) { // Tests the reverse operation on a 4D float array on dimension 0 and 1. TEST_F(ReverseTest, Reverse4DFloatArrayOnDim01) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); // Input shape is float[4x3x2x1]. // clang-format off Array4D input({ diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index e2d406f66d94f8ec76faa5b7d2d2e84dcaf6db57..7ca99a91635e85cd0888e59ecde31e47fec21844 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #define DISABLED_ON_CPU(X) X -#define DISABLED_ON_CPU_PARALLEL(X) X #define DISABLED_ON_GPU(X) X #define DISABLED_ON_INTERPRETER(X) X @@ -51,13 +50,6 @@ limitations under the License. # define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X) #endif // XLA_TEST_BACKEND_CPU -#ifdef XLA_TEST_BACKEND_CPU_PARALLEL -# undef DISABLED_ON_CPU -# define DISABLED_ON_CPU(X) XLA_TEST_PASTE(DISABLED_, X) -# undef DISABLED_ON_CPU_PARALLEL -# define DISABLED_ON_CPU_PARALLEL(X) XLA_TEST_PASTE(DISABLED_, X) -#endif // XLA_TEST_BACKEND_CPU_PARALLEL - #ifdef XLA_TEST_BACKEND_GPU # undef DISABLED_ON_GPU # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X) diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc index e8efc6e2a83f42bf81fc1261ba508632cf3f85b3..59afd28a80c0fbf3df38457cd05961c883769856 100644 --- a/tensorflow/compiler/xla/tests/test_utils_test.cc +++ b/tensorflow/compiler/xla/tests/test_utils_test.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/local_client_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -28,7 +28,7 @@ namespace { class TestUtilsTest : public LocalClientTestBase {}; XLA_TEST_F(TestUtilsTest, UnusedParam) { - ComputationBuilder builder(local_client_, TestName()); + XlaBuilder builder(TestName()); // Make the reduction lambda. Shape single_float = ShapeUtil::MakeShape(F32, {}); builder.Parameter(0, single_float, "unused"); diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 61d0fa02aba80ddf60ab83f8355a5e3c1ef44518..61be1746530a1991271107910d033f6b93ea9bfd 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -269,7 +269,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) { ComputeAndCompareR2(&builder, expected, {}, error_spec_); } -XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesOnFalse) { // Tests a selection between tuples with "false" path taken. XlaBuilder builder(TestName()); @@ -313,7 +313,7 @@ XLA_TEST_F(TupleTest, TuplesInAMap) { ComputeAndCompareR1(&b, {-99.0f, 101.0f, 214.41f}, {}, error_spec_); } -XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesOnTrue) { // Tests a selection between tuples with "true" path taken. XlaBuilder builder(TestName()); @@ -350,7 +350,7 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) { } // Cascaded selects between tuple types. -XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesCascaded) { // // vec1 vec2 vec2 vec1 // | | | | @@ -390,8 +390,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) { ComputeAndCompareR1(&builder, {3.f, 6.f, 9.f}, {}, error_spec_); } -XLA_TEST_F(TupleTest, - DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesReuseConstants)) { +XLA_TEST_F(TupleTest, SelectBetweenTuplesReuseConstants) { // Similar to SelectBetweenTuples, but the constants are shared between the // input tuples. XlaBuilder builder(TestName()); @@ -516,10 +515,8 @@ XLA_TEST_F(TupleTest, ComplexTuples) { class TupleHloTest : public HloTestBase {}; -// Disabled on CPU parallel because that's broken and will be removed soon. // Disabled on the interpreter because bitcast doesn't exist on the interpreter. -TEST_F(TupleHloTest, - DISABLED_ON_INTERPRETER(DISABLED_ON_CPU_PARALLEL(BitcastAfterGTE))) { +TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) { const char* testcase = R"( HloModule m diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc index 32ba067a10df6c15348344da813e6a960f05491c..82d301983fc7885ef5c1c1ed05b74fc017bb7727 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc @@ -19,9 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -33,9 +33,9 @@ namespace { class VecOpsReduceTest : public ClientLibraryTestBase { public: - VecOpsReduceTest() : builder_(client_, TestName()) {} + VecOpsReduceTest() : builder_(TestName()) {} - ComputationDataHandle BuildSampleConstantCube() { + XlaOp BuildSampleConstantCube() { // clang-format off Array3D x3d({ {{1.0, 2.0, 3.0}, // | dim 1 // } plane 0 in dim 0 @@ -49,7 +49,7 @@ class VecOpsReduceTest : public ClientLibraryTestBase { return builder_.ConstantR3FromArray3D(x3d); } - ComputationBuilder builder_; + XlaBuilder builder_; ErrorSpec errspec_{1e-3, 0}; }; diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 697d78fe6e9abf6136f9ec4a708ef983732d0013..3dded3f7157195b2c7aaac2ff9aac79ca4611d05 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -19,10 +19,11 @@ limitations under the License. #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -49,7 +50,7 @@ class VecOpsSimpleTest : public ClientLibraryTestBase { }; XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto exp = builder.Exp(x); @@ -63,7 +64,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpTenValues) { XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) { for (int count : {63, 64, 65, 127, 128, 129, 17 * 4096}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector exponents; exponents.reserve(count); for (int i = 0; i < count; ++i) { @@ -84,7 +85,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpManyValues) { } XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array4D exponents(2, 2, 2, 2); std::vector exponents_vector; @@ -106,7 +107,7 @@ XLA_TEST_F(VecOpsSimpleTest, ExpIn4D) { } XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); builder.Neg(x); @@ -117,7 +118,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenFloatValues) { } XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({2, -2, 12, -4, 5, 20, -15, 0, -2, 1}); builder.Neg(x); @@ -126,7 +127,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateTenInt32Values) { } XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {0, 1, 42, static_cast(-1), static_cast(-12)}); builder.Neg(x); @@ -136,7 +137,7 @@ XLA_TEST_F(VecOpsSimpleTest, NegateUint32Values) { } XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); builder.SquareF32(x); @@ -147,7 +148,7 @@ XLA_TEST_F(VecOpsSimpleTest, SquareTenValues) { } XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); builder.ReciprocalF32(x); @@ -159,7 +160,7 @@ XLA_TEST_F(VecOpsSimpleTest, ReciprocalTenValues) { } XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({0.0, -0.0}); auto exp = builder.SqrtF32(x); @@ -167,7 +168,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtZeroes) { } XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345}); auto exp = builder.SqrtF32(x); @@ -176,7 +177,7 @@ XLA_TEST_F(VecOpsSimpleTest, SqrtSixValues) { } XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({16.0, 1.0, 1024.0, 0.16, 0.2, 12345, 1.2345}); auto exp = builder.Pow(x, builder.ConstantR0(-.5f)); @@ -188,7 +189,7 @@ XLA_TEST_F(VecOpsSimpleTest, InvSqrtSevenValues) { } XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto add = CreateScalarAddComputation(F32, &builder); auto x = builder.ConstantR1( @@ -203,7 +204,7 @@ XLA_TEST_F(VecOpsSimpleTest, AddTenValuesViaMap) { } XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto y = builder.ConstantR1( @@ -218,8 +219,8 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValues) { XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) { // Similar to MaxTenValues, except that the inputs come from params rather // than constants. - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle v1, v2; + XlaBuilder builder(TestName()); + XlaOp v1, v2; std::unique_ptr param0_data = CreateR1Parameter( {41.0f, 2.0f, 3.0f, 84.0f}, /*parameter_number=*/0, /*name=*/"v1", /*builder=*/&builder, /*data_handle=*/&v1); @@ -236,7 +237,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesFromParams) { XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { // Similar to MaxTenValuesFromParams, except that the data size passed in and // out is large. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Number of floats in the data passed into and out of the computation. constexpr int datalen = 15 * 1000; @@ -259,7 +260,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { expected_vec.push_back(larger); } - ComputationDataHandle v1, v2; + XlaOp v1, v2; std::unique_ptr param0_data = CreateR1Parameter(v1vec, /*parameter_number=*/0, /*name=*/"v1", /*builder=*/&builder, /*data_handle=*/&v1); @@ -274,7 +275,7 @@ XLA_TEST_F(VecOpsSimpleTest, Max15000ValuesFromParams) { } XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto y = builder.ConstantR0(0); @@ -286,7 +287,7 @@ XLA_TEST_F(VecOpsSimpleTest, MaxTenValuesWithScalar) { } XLA_TEST_F(VecOpsSimpleTest, MinTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {2.1, -2.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); auto y = builder.ConstantR1( @@ -299,7 +300,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinTenValues) { } XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0); auto one = builder.ConstantR0(1); auto x = builder.ConstantR1( @@ -312,7 +313,7 @@ XLA_TEST_F(VecOpsSimpleTest, MinMaxTenValues) { } XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0); auto one = builder.ConstantR0(1); auto x = builder.ConstantR1( @@ -325,7 +326,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstant) { } XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR1({0.0f, 0.0f}); auto one = builder.ConstantR1({1.0f, 1.0f}); auto x = builder.ConstantR1({2.1, -2.6}); @@ -336,7 +337,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTwoValuesConstant) { } XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto one = builder.ConstantR0(1); auto two = builder.ConstantR0(2); auto x = builder.ConstantR1( @@ -348,11 +349,22 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { ComputeAndCompareR1(&builder, expected, {}); } +XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) { + ComputationBuilder builder(client_, TestName()); + auto zero = builder.ConstantR0(0); + auto one = builder.ConstantR0(10); + auto x = builder.ConstantR1({-3, 3, 9, 13}); + auto clamp = builder.Clamp(zero, x, one); + + std::vector expected = {0, 3, 9, 10}; + ComputeAndCompareR1(&builder, expected, {}); +} + XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { - Computation add_half; + XlaComputation add_half; { // add_half(x) = x + 0.5 - ComputationBuilder builder(client_, "add_half"); + XlaBuilder builder("add_half"); auto x_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x_value"); auto half = builder.ConstantR0(0.5); @@ -362,10 +374,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { add_half = computation_status.ConsumeValueOrDie(); } - Computation clamp; + XlaComputation clamp; { // clamp(y) = clamp<0,5>(y) - ComputationBuilder builder(client_, "clamp"); + XlaBuilder builder("clamp"); auto y_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "y_value"); auto zero = builder.ConstantR0(0.0); @@ -375,10 +387,10 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { clamp = computation_status.ConsumeValueOrDie(); } - Computation mult_relu_add; + XlaComputation mult_relu_add; { // mult_relu_add(z) = clamp(add_half(2 * max(z, 0))) - ComputationBuilder builder(client_, "mult_relu_add"); + XlaBuilder builder("mult_relu_add"); auto z_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value"); auto zero = builder.ConstantR0(0.0); @@ -392,7 +404,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { mult_relu_add = computation_status.ConsumeValueOrDie(); } - ComputationBuilder builder(client_, "map10"); + XlaBuilder builder("map10"); { auto x = builder.ConstantR1( {2.1, -21.6, 2.6, -4.0, 2.1, 2.3, -5.0, -0.9, -2.4, 1.6}); @@ -405,7 +417,7 @@ XLA_TEST_F(VecOpsSimpleTest, MapTenValues) { } XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({-5, -4, -3, -2, -1, 0, 1, 2, 3, 4}); auto y = builder.ConstantR0(3); builder.Rem(x, y); @@ -415,7 +427,7 @@ XLA_TEST_F(VecOpsSimpleTest, RemainderTenValuesS32) { } XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({false, true}); auto y = builder.ConstantR1({true, false}); builder.Eq(x, y); @@ -425,7 +437,7 @@ XLA_TEST_F(VecOpsSimpleTest, VectorPredicateEqual) { } XLA_TEST_F(VecOpsSimpleTest, VectorPredicateNotEqual) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1({false, true}); auto y = builder.ConstantR1({true, false}); builder.Ne(x, y); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 1e18b5679956710d7e43c6209be3fdfe4877b4d3..336fed27c6f19fab7f8b171b7364e323a6e08a07 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1321,10 +1321,6 @@ void BM_WhileLoop(int num_iters) { } } -// TODO(b/32470510): Benchmark fails on parallel CPU backend. -#ifndef XLA_TEST_BACKEND_CPU_PARALLEL BENCHMARK(BM_WhileLoop); -#endif - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 837a01e873e1cfbcf7c52c13833bbe4366cc9163..8354bb71cb7e8875b654e6624a5a1a13cbf30d9d 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -175,8 +175,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, XLA_VLOG_LINES(4, *profile_output); } -// TODO(b/71364943): This test exposes a bug in the parallel CPU backend. -XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) { +XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { const int64 m = 256, k = 256, n = 256; Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k}); Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k}); @@ -239,12 +238,9 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_CPU_PARALLEL(ProfileSingleComputation)) { EXPECT_TRUE(HasTrops(tanh_profile)); } -// TODO(b/71364943): This test exposes a bug in the parallel CPU backend. -// // TODO(b/71544591): The GPU backend does not record cycles spent in on Hlo // instructions "interior" to while nodes. -XLA_TEST_F(HloProfileTest, - DISABLED_ON_GPU(DISABLED_ON_CPU_PARALLEL(ProfileWhileComputation))) { +XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { const int64 size = 256; Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size}); Shape while_result_shape = diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc index 95d3fd28b38a5945f5ed685f07db046701250273..fdbfc0210ea63ac4350ba48ac3354d23c53c69a7 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc +++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc @@ -303,18 +303,14 @@ bool HloParser::ParseComputations() { // set the layouts to what the hlo text says. for (int p = 0; p < computation->num_parameters(); p++) { const Shape& param_shape = computation->parameter_instruction(p)->shape(); - if (param_shape.has_layout()) { - module_->mutable_entry_computation_layout() - ->mutable_parameter_layout(p) - ->ResetLayout(param_shape.layout()); - } + TF_CHECK_OK(module_->mutable_entry_computation_layout() + ->mutable_parameter_layout(p) + ->CopyLayoutFromShape(param_shape)); } const Shape& result_shape = computation->root_instruction()->shape(); - if (result_shape.has_layout()) { - module_->mutable_entry_computation_layout() - ->mutable_result_layout() - ->ResetLayout(result_shape.layout()); - } + TF_CHECK_OK(module_->mutable_entry_computation_layout() + ->mutable_result_layout() + ->CopyLayoutFromShape(result_shape)); } return true; diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc index 93284b80f9e1f82c4b18dc7388754d5c01a7740c..f11123ca24849af1d9c4fd49809a986eb7202bd5 100644 --- a/tensorflow/compiler/xla/window_util.cc +++ b/tensorflow/compiler/xla/window_util.cc @@ -199,6 +199,9 @@ bool IsInactiveWindowDimension(const Window& window, int64 logical_dim) { int64 DilatedBound(int64 bound, int64 dilation) { CHECK_GE(bound, 0); CHECK_GE(dilation, 1); + if (bound == 0) { + return 0; + } // Suppose the array has three entries 123 and the dilation factor is 4. Then // the dilated array has 9 entries 1xxx2xxx3. Here, each original entry except @@ -212,7 +215,7 @@ int64 StridedBound(int64 bound, int64 window_size, int64 stride) { CHECK_GE(bound, 0); CHECK_GE(stride, 1); - if (window_size > bound) { + if (bound == 0 || window_size > bound) { return 0; } diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index 7e47516550068e8f38d2155e48e229c2ab77b488..abdbdb4cd22ff38a0fae89af10c600a178d9a3d4 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -25,11 +25,13 @@ py_library( "//tensorflow/contrib/batching:batch_py", "//tensorflow/contrib/bayesflow:bayesflow_py", "//tensorflow/contrib/boosted_trees:init_py", + "//tensorflow/contrib/checkpoint/python:checkpoint", "//tensorflow/contrib/cloud:cloud_py", "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip", "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", - "//tensorflow/contrib/coder:coder_ops_py", + "//tensorflow/contrib/coder:coder_py", "//tensorflow/contrib/compiler:compiler_py", + "//tensorflow/contrib/constrained_optimization", "//tensorflow/contrib/copy_graph:copy_graph_py", "//tensorflow/contrib/crf:crf_py", "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py", diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 0d163daa6e23435fe42aaa046860ea434039a619..7f33d460dce0778601ecfd3456f118189a3f346e 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -29,6 +29,7 @@ from tensorflow.contrib import cloud from tensorflow.contrib import cluster_resolver from tensorflow.contrib import coder from tensorflow.contrib import compiler +from tensorflow.contrib import constrained_optimization from tensorflow.contrib import copy_graph from tensorflow.contrib import crf from tensorflow.contrib import cudnn_rnn diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py index 8add2aacff1d64f1617cd24167c4c6c6706044da..159d985db5c48f8fe1a26350255f8d8f68482473 100644 --- a/tensorflow/contrib/all_reduce/python/all_reduce.py +++ b/tensorflow/contrib/all_reduce/python/all_reduce.py @@ -18,10 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import math -import re from tensorflow.contrib import nccl +from tensorflow.python.framework import device as device_lib from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -659,21 +660,20 @@ def _split_by_task(devices, values): num_devices = len(devices) if num_devices != len(values): raise ValueError("len(devices) must equal len(values)") - pattern = re.compile(r"/task:(\d+)/") - per_task_devices = [] - per_task_values = [] + per_task_devices = collections.OrderedDict() + per_task_values = collections.OrderedDict() for d in range(num_devices): - m = pattern.search(devices[d]) - if m: - index = int(m.group(1)) - while index >= len(per_task_devices): - per_task_devices.append([]) - per_task_values.append([]) - per_task_devices[index].append(devices[d]) - per_task_values[index].append(values[d]) - else: + d_spec = device_lib.DeviceSpec.from_string(devices[d]) + if not hasattr(d_spec, "task") or d_spec.task is None: assert False, "failed to parse device %s" % devices[d] - return (per_task_devices, per_task_values) + index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task) + if index not in per_task_devices: + per_task_devices[index] = [] + per_task_values[index] = [] + per_task_devices[index].append(devices[d]) + per_task_values[index].append(values[d]) + + return (list(per_task_devices.values()), list(per_task_values.values())) def build_nccl_all_reduce(input_tensors, red_op, un_op=None): diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md index 7e84f237dc9a83098f142a54c48cf5b6ba35aaaa..0fcbf5dd59ceceb337434b1e27006ddaf620b017 100644 --- a/tensorflow/contrib/autograph/README.md +++ b/tensorflow/contrib/autograph/README.md @@ -1,4 +1,117 @@ -# Autograph +# AutoGraph -A compiler for generating TensorFlow numeric and control flow ops from Python -code. +IMPORTANT: AutoGraph is pre-alpha, under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback! + +AutoGraph is a Python to TensorFlow compiler. + +With AutoGraph, you can write [Eager style](https://www.tensorflow.org/programmers_guide/eager) code in a concise manner, and run it as a TensorFlow graph. AutoGraph uses source code transformation and partial evaluation to generate Python code that builds an equivalent TensorFlow subgraph. The result is code that behaves like ops and can be freely combined with other TensorFlow ops. + +For example, this Python function: + +``` +def f(x): + if x < 0: + x = -x + return x +``` + +would be converted to this: + +``` +def graph_mode_f(x): + with tf.name_scope('f'): + + def if_true(): + with tf.name_scope('if_true'): + x_1, = x, + x_1 = tf.negative(x_1) + return x_1, + + def if_false(): + with tf.name_scope('if_false'): + x_1, = x, + return x_1, + x = ag__.utils.run_cond(tf.greater(x, 0), if_true, if_false) + return x +``` + +so you can use it like an op: + +``` +with tf.Graph().as_default(): + x = tf.constant(-1.0) + + converted_f = autograph.to_graph(f) + y = converted_f(x) + + with tf.Session() as sess: + print(sess.run(y)) + # Output: 1 +``` + +# Getting started + +Use AutoGraph in one of the following ways, described below: + + 1. Annotations (simpler) + 2. Functional API (more flexible) + +NOTE: You can find more examples in this [interactive notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb). + +To get started, install the latest nightly TensorFlow build: + +```shell +pip install -U tf-nightly +``` + +Then import the `autograph` module from `tf.contrib`: + +``` +from tensorflow.contrib import autograph as ag +``` + +## Using with annotations + +Annotating a function or class with `@convert` converts it in place: + +``` +@ag.convert() +def f(x): + if x < 0: + x = -x + return x +``` + +... so that it always outputs TensorFlow code: + +``` +with tf.Graph().as_default(): + x = tf.constant(-1) + + y = f(x) + + with tf.Session() as sess: + print(sess.run(y)) + # Output: 1 +``` + +## Using the functional API + +The functional API allows you to convert an existing function, class or object after it was defined: + +``` +converted_f = ag.to_graph(f) + +print(converted_f(tf.constant(-1))) +# Output: Tensor + +print(f(-1)) +# Output: 1 +``` + +You can use the functional API to inspect the generated code as well: + +``` +print(ag.to_code(f)) +# Output: +``` diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index 7df514cd207c5e781f3b4abaa2020016b197669d..9d6cc9245aa463d0c8cfc7ad209736357b6c0323 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -417,9 +417,18 @@ class SparseSplitHandler(InequalitySplitHandler): return (are_splits_ready, partition_ids, gains, split_infos) -@function.Defun(dtypes.bool, dtypes.bool, dtypes.float32, dtypes.float32, - dtypes.int32, dtypes.float32, dtypes.float32, dtypes.float32, - dtypes.float32, dtypes.float32) +@function.Defun( + dtypes.bool, + dtypes.bool, + dtypes.float32, + dtypes.float32, + dtypes.int32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + noinline=True) def dense_make_stats_update(is_active, are_buckets_ready, float_column, quantile_buckets, example_partition_ids, gradients, hessians, weights, empty_gradients, empty_hessians): @@ -452,9 +461,20 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column, gradients, hessians) -@function.Defun(dtypes.bool, dtypes.bool, dtypes.int64, dtypes.float32, - dtypes.int64, dtypes.float32, dtypes.int32, dtypes.float32, - dtypes.float32, dtypes.float32, dtypes.float32, dtypes.float32) +@function.Defun( + dtypes.bool, + dtypes.bool, + dtypes.int64, + dtypes.float32, + dtypes.int64, + dtypes.float32, + dtypes.int32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + noinline=True) def sparse_make_stats_update( is_active, are_buckets_ready, sparse_column_indices, sparse_column_values, sparse_column_shape, quantile_buckets, example_partition_ids, gradients, diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 4bde7f3e33d6f8b295cd35cb32bbbccecf8a2b87..08c1dcdd028829e6ef290965347d184ed42f416d 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -970,10 +970,8 @@ class GradientBoostedDecisionTreeModel(object): # Stack all the inputs to one tensor per type. # This is a workaround for the slowness of graph building in tf.cond. # See (b/36554864). - split_sizes = array_ops.stack([ - array_ops.shape(partition_id)[0] - for partition_id in partition_ids_list - ]) + split_sizes = array_ops.reshape( + array_ops.shape_n(partition_ids_list), [-1]) partition_ids = array_ops.concat(partition_ids_list, axis=0) gains = array_ops.concat(gains_list, axis=0) split_infos = array_ops.concat(split_info_list, axis=0) diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py index 70d7d2d8d79865af0cd9dd3fd67e8224219bcad0..1192cc44a17823f69db28947308a8b839a83e57e 100644 --- a/tensorflow/contrib/checkpoint/__init__.py +++ b/tensorflow/contrib/checkpoint/__init__.py @@ -16,6 +16,7 @@ For creating and managing dependencies: +@@dot_graph_from_checkpoint @@split_dependency """ @@ -24,6 +25,8 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency +from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint + from tensorflow.python.util.all_util import remove_undocumented remove_undocumented(module_name=__name__) diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD index d57b01aab26a9365db49d0fc0ebfe3f0f564a071..a5681ffa61d07ef29d0a0862db9736a210c8e26e 100644 --- a/tensorflow/contrib/checkpoint/python/BUILD +++ b/tensorflow/contrib/checkpoint/python/BUILD @@ -4,6 +4,15 @@ package(default_visibility = ["//tensorflow:internal"]) load("//tensorflow:tensorflow.bzl", "py_test") +py_library( + name = "checkpoint", + srcs_version = "PY2AND3", + deps = [ + ":split_dependency", + ":visualize", + ], +) + py_library( name = "split_dependency", srcs = ["split_dependency.py"], @@ -27,3 +36,26 @@ py_test( "//tensorflow/python/eager:test", ], ) + +py_library( + name = "visualize", + srcs = ["visualize.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:pywrap_tensorflow", + ], +) + +py_test( + name = "visualize_test", + srcs = ["visualize_test.py"], + deps = [ + ":visualize", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:training", + "//tensorflow/python/eager:test", + ], +) diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..86fbdb41d2c37803f2bd71b5aa2f72845c87d448 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/visualize.py @@ -0,0 +1,111 @@ +"""Utilities for visualizing dependency graphs.""" +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.core.protobuf import checkpointable_object_graph_pb2 +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.framework import errors_impl +from tensorflow.python.training import checkpointable + + +def dot_graph_from_checkpoint(save_path): + r"""Visualizes an object-based checkpoint (from `tf.train.Checkpoint`). + + Useful for inspecting checkpoints and debugging loading issues. + + Example usage from Python (requires pydot): + ```python + import tensorflow as tf + import pydot + + dot_string = tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt') + parsed, = pydot.graph_from_dot_data(dot_string) + parsed.write_svg('/tmp/tensorflow/visualized_checkpoint.svg') + ``` + + Example command line usage: + ```sh + python -c "import tensorflow as tf;\ + print(tf.contrib.checkpoint.dot_graph_from_checkpoint('/path/to/ckpt'))"\ + | dot -Tsvg > /tmp/tensorflow/checkpoint_viz.svg + ``` + + Args: + save_path: The checkpoint prefix, as returned by `tf.train.Checkpoint.save` + or `tf.train.latest_checkpoint`. + Returns: + A graph in DOT format as a string. + """ + reader = pywrap_tensorflow.NewCheckpointReader(save_path) + try: + object_graph_string = reader.get_tensor( + checkpointable.OBJECT_GRAPH_PROTO_KEY) + except errors_impl.NotFoundError: + raise ValueError( + ('The specified checkpoint "%s" does not appear to be object-based (it ' + 'is missing the key "%s"). Likely it was created with a name-based ' + 'saver and does not contain an object dependency graph.') % ( + save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY)) + shape_map = reader.get_variable_to_shape_map() + dtype_map = reader.get_variable_to_dtype_map() + object_graph = ( + checkpointable_object_graph_pb2.CheckpointableObjectGraph()) + object_graph.ParseFromString(object_graph_string) + graph = 'digraph {\n' + def _escape(name): + return name.replace('"', '\\"') + slot_ids = set() + for node in object_graph.nodes: + for slot_reference in node.slot_variables: + slot_ids.add(slot_reference.slot_variable_node_id) + for node_id, node in enumerate(object_graph.nodes): + if (len(node.attributes) == 1 + and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY): + if node_id in slot_ids: + color = 'orange' + tooltip_prefix = 'Slot variable' + else: + color = 'blue' + tooltip_prefix = 'Variable' + attribute = node.attributes[0] + graph += ('N_%d [shape=point label="" color=%s width=.25' + ' tooltip="%s %s shape=%s %s"]\n') % ( + node_id, + color, + tooltip_prefix, + _escape(attribute.full_name), + shape_map[attribute.checkpoint_key], + dtype_map[attribute.checkpoint_key].name) + elif node.slot_variables: + graph += ('N_%d [shape=point label="" width=.25 color=red,' + 'tooltip="Optimizer"]\n') % node_id + else: + graph += 'N_%d [shape=point label="" width=.25]\n' % node_id + for reference in node.children: + graph += 'N_%d -> N_%d [label="%s"]\n' % ( + node_id, reference.node_id, _escape(reference.local_name)) + for slot_reference in node.slot_variables: + graph += 'N_%d -> N_%d [label="%s" style=dotted]\n' % ( + node_id, + slot_reference.slot_variable_node_id, + _escape(slot_reference.slot_name)) + graph += 'N_%d -> N_%d [style=dotted]\n' % ( + slot_reference.original_variable_node_id, + slot_reference.slot_variable_node_id) + graph += '}\n' + return graph diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1d9ab789235cb964521315b4864563f89745ae75 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/visualize_test.py @@ -0,0 +1,97 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os + +from tensorflow.contrib.checkpoint.python import visualize + +from tensorflow.python.eager import context +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.keras._impl.keras.engine import training +from tensorflow.python.keras._impl.keras.layers import core +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.training import adam +from tensorflow.python.training import checkpointable_utils + +try: + import pydot # pylint: disable=g-import-not-at-top +except ImportError: + pydot = None + + +class MyModel(training.Model): + """A concrete Model for testing.""" + + def __init__(self): + super(MyModel, self).__init__() + self._named_dense = core.Dense(1, use_bias=True) + self._second = core.Dense(1, use_bias=False) + + def call(self, values): + ret = self._second(self._named_dense(values)) + return ret + + +class DotGraphTests(test.TestCase): + + def testMakeDotGraph(self): + with context.eager_mode(): + input_value = constant_op.constant([[3.]]) + model = MyModel() + optimizer = adam.AdamOptimizer(0.001) + optimizer_step = resource_variable_ops.ResourceVariable(12) + save_checkpoint = checkpointable_utils.Checkpoint( + optimizer=optimizer, model=model, optimizer_step=optimizer_step) + optimizer.minimize(functools.partial(model, input_value)) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt') + save_path = save_checkpoint.save(checkpoint_prefix) + prefix = save_checkpoint.save(save_path) + + dot_graph_string = visualize.dot_graph_from_checkpoint(prefix) + + # The remainder of this test is more-or-less optional since it's so + # dependent on pydot/platform/Python versions. + if pydot is None: + self.skipTest('pydot is required for the remainder of this test.') + try: + parsed, = pydot.graph_from_dot_data(dot_graph_string) + except NameError as e: + if "name 'dot_parser' is not defined" in str(e): + self.skipTest("pydot isn't working") + else: + raise + # Check that the graph isn't completely trivial + self.assertEqual( + '"model"', + parsed.obj_dict['edges'][('N_0', 'N_1')][0]['attributes']['label']) + image_path = os.path.join(self.get_temp_dir(), 'saved.svg') + try: + parsed.write_svg(image_path) + except Exception as e: # pylint: disable=broad-except + # For some reason PyDot's "dot not available" error is an Exception, not + # something more specific. + if '"dot" not found in path' in str(e): + self.skipTest("pydot won't save SVGs (dot not available)") + else: + raise + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 5a2771229d9ffe2b5b389d1077fe02a230e9a4c0..1403483d287041b02dfbf538f7e7ddee11662f47 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -245,7 +245,7 @@ class TPUClusterResolver(ClusterResolver): else: if not self._tpu.startswith(compat.as_bytes('grpc://')): # Case 3. - return server_lib.ClusterSpec({}) + return None # Case 2. cluster_spec = {self._job_name: [self._tpu[len( compat.as_bytes('grpc://')):]]} diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index dff7a03b6847fb6e159dc2fa9832fceb3dfe2d54..5b3f9be5a11237f9dceebefa1db294efaf7e482d 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -356,8 +356,7 @@ class TPUClusterResolverTest(test.TestCase): tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar') self.assertEqual( compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master()) - self.assertEqual( - server_lib.ClusterSpec({}), tpu_cluster_resolver.cluster_spec()) + self.assertEqual(None, tpu_cluster_resolver.cluster_spec()) def testGkeEnvironment(self): os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470' diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 10f29deca08f2a70eeeb7c758f3268e0bce11e8c..5f38a8e5c755cd11a568e437b3ade665d983461f 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -214,6 +214,7 @@ include(protobuf) include(re2) include(cub) include(sqlite) +include(double_conversion) if (tensorflow_BUILD_CC_TESTS) include(googletest) endif() @@ -234,6 +235,7 @@ set(tensorflow_EXTERNAL_LIBRARIES ${protobuf_STATIC_LIBRARIES} ${re2_STATIC_LIBRARIES} ${sqlite_STATIC_LIBRARIES} + ${double_conversion_STATIC_LIBRARIES} ) if (systemlib_ZLIB) @@ -261,6 +263,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES fft2d re2 sqlite_copy_headers_to_destination + double_conversion ) include_directories( @@ -283,6 +286,7 @@ include_directories( ${PROTOBUF_INCLUDE_DIRS} ${re2_INCLUDE_DIR} ${sqlite_INCLUDE_DIR} + ${double_conversion_INCLUDE_DIR} ) if(tensorflow_ENABLE_SSL_SUPPORT) diff --git a/tensorflow/contrib/cmake/external/double_conversion.cmake b/tensorflow/contrib/cmake/external/double_conversion.cmake new file mode 100644 index 0000000000000000000000000000000000000000..527ccdc8d887cb4c2e7d2412c99a8bc682568472 --- /dev/null +++ b/tensorflow/contrib/cmake/external/double_conversion.cmake @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +include (ExternalProject) + +set(double_conversion_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/double_conversion/src/double_conversion) +set(double_conversion_URL https://github.com/google/double-conversion.git) +set(double_conversion_TAG 5664746) +set(double_conversion_BUILD ${double_conversion_INCLUDE_DIR}) +set(double_conversion_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.so) +set(double_conversion_INCLUDES ${double_conversion_BUILD}) + +if(WIN32) + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/$(Configuration)/double-conversion.lib) +else() + set(double_conversion_STATIC_LIBRARIES ${double_conversion_BUILD}/double-conversion/libdouble-conversion.a) +endif() + +set(double_conversion_HEADERS + "${double_conversion_INCLUDE_DIR}/double-conversion/bignum-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/cached-powers.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/double-conversion.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/fixed-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/strtod.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/bignum.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/diy-fp.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/fast-dtoa.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/ieee.h" + "${double_conversion_INCLUDE_DIR}/double-conversion/utils.h" +) + +ExternalProject_Add(double_conversion + PREFIX double_conversion + GIT_REPOSITORY ${double_conversion_URL} + GIT_TAG ${double_conversion_TAG} + DOWNLOAD_DIR "${DOWNLOAD_LOCATION}" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "" + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=Release + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON +) diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index fbcdf7e753de45519203c5747b1b2217c36527f0..6468bed4979253be5c20666d26bf24fa479d64a0 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -129,7 +129,11 @@ tensorflow/contrib/boosted_trees/kernels tensorflow/contrib/boosted_trees/ops tensorflow/contrib/boosted_trees/proto tensorflow/contrib/boosted_trees/python +tensorflow/contrib/boosted_trees/python/kernel_tests tensorflow/contrib/boosted_trees/python/ops +tensorflow/contrib/boosted_trees/python/training +tensorflow/contrib/boosted_trees/python/training/functions +tensorflow/contrib/boosted_trees/python/utils tensorflow/contrib/checkpoint tensorflow/contrib/checkpoint/python tensorflow/contrib/cloud @@ -144,8 +148,11 @@ tensorflow/contrib/coder tensorflow/contrib/coder/kernels tensorflow/contrib/coder/ops tensorflow/contrib/coder/python +tensorflow/contrib/coder/python/layers tensorflow/contrib/coder/python/ops tensorflow/contrib/compiler +tensorflow/contrib/constrained_optimization +tensorflow/contrib/constrained_optimization/python tensorflow/contrib/copy_graph tensorflow/contrib/copy_graph/python tensorflow/contrib/copy_graph/python/util diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index ed018b4fed8e47632f632723f19cc755f2079f86..376496b33f404e42a20b81ab26bf6ddd6541f28f 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -63,6 +63,7 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/training_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/pmf_to_cdf_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc" diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD index 9ca4ce8a9c765677865f77ea4982ad8613ce334c..a2c6e413039ee3b5af3cb53d1af3325037536d36 100644 --- a/tensorflow/contrib/coder/BUILD +++ b/tensorflow/contrib/coder/BUILD @@ -1,5 +1,5 @@ # Description: -# Contains entropy coding related modules. +# Contains tools related to data compression. package(default_visibility = [ "//learning/brain:__subpackages__", @@ -54,19 +54,27 @@ tf_gen_op_libs( ], ) +cc_library( + name = "range_coder_ops_util", + srcs = ["kernels/range_coder_ops_util.cc"], + hdrs = ["kernels/range_coder_ops_util.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + ], +) + tf_kernel_library( name = "range_coder_ops", srcs = [ "kernels/range_coder_ops.cc", - "kernels/range_coder_ops_util.cc", - ], - hdrs = [ - "kernels/range_coder_ops_util.h", ], visibility = ["//visibility:public"], deps = [ ":coder_ops_op_lib", ":range_coder", + ":range_coder_ops_util", "//tensorflow/core:framework", "//tensorflow/core:lib", ], @@ -152,10 +160,21 @@ tf_gen_op_wrapper_py( deps = [":coder_ops_op_lib"], ) +py_library( + name = "coder_py", + srcs = [ + "__init__.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":coder_ops_py", + ":entropybottleneck_py", + ], +) + tf_custom_op_py_library( name = "coder_ops_py", srcs = [ - "__init__.py", "python/ops/coder_ops.py", ], dso = [ @@ -186,3 +205,44 @@ tf_py_test( ], main = "python/ops/coder_ops_test.py", ) + +py_library( + name = "entropybottleneck_py", + srcs = [ + "python/layers/entropybottleneck.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":coder_ops_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:functional_ops", + "//tensorflow/python:init_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn", + "//tensorflow/python:ops", + "//tensorflow/python:random_ops", + "//tensorflow/python:state_ops", + "//tensorflow/python:summary_ops", + "//tensorflow/python:tensor_shape", + "//tensorflow/python:variable_scope", + "//tensorflow/python/eager:context", + "//tensorflow/python/keras:engine", + "//third_party/py/numpy", + ], +) + +tf_py_test( + name = "entropybottleneck_py_test", + srcs = [ + "python/layers/entropybottleneck_test.py", + ], + additional_deps = [ + ":entropybottleneck_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:variables", + "//tensorflow/python:training", + ], + main = "python/layers/entropybottleneck_test.py", +) diff --git a/tensorflow/contrib/coder/__init__.py b/tensorflow/contrib/coder/__init__.py index b7e663e6f1359f399cdaa80e037635a8f7546b37..99b8ac7595ec632b2918e6b7ca22c06dd7f0a8b3 100644 --- a/tensorflow/contrib/coder/__init__.py +++ b/tensorflow/contrib/coder/__init__.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Entropy code operations.""" +"""Data compression tools.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function # pylint: disable=wildcard-import +from tensorflow.contrib.coder.python.layers.entropybottleneck import * from tensorflow.contrib.coder.python.ops.coder_ops import * # pylint: enable=wildcard-import diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py new file mode 100644 index 0000000000000000000000000000000000000000..f039cb0f5265b920200f63c5bd5ebeb4e23826be --- /dev/null +++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py @@ -0,0 +1,697 @@ +# -*- coding: utf-8 -*- +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Entropy bottleneck layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.coder.python.ops import coder_ops + +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras._impl.keras import engine +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import functional_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.summary import summary + + +class EntropyBottleneck(engine.Layer): + """Entropy bottleneck layer. + + This layer can be used to model the entropy (the amount of information + conveyed) of the tensor passing through it. During training, this can be used + to impose a (soft) entropy constraint on its activations, limiting the amount + of information flowing through the layer. Note that this is distinct from + other types of bottlenecks, which reduce the dimensionality of the space, for + example. Dimensionality reduction does not limit the amount of information, + and does not enable efficient data compression per se. + + After training, this layer can be used to compress any input tensor to a + string, which may be written to a file, and to decompress a file which it + previously generated back to a reconstructed tensor (possibly on a different + machine having access to the same model checkpoint). The entropies estimated + during training or evaluation are approximately equal to the average length of + the strings in bits. + + The layer implements a flexible probability density model to estimate entropy, + which is described in the appendix of the paper (please cite the paper if you + use this code for scientific work): + + "Variational image compression with a scale hyperprior" + + Johannes Ballé, David Minnen, Saurabh Singh, Sung Jin Hwang, Nick Johnston + + https://arxiv.org/abs/1802.01436 + + The layer assumes that the input tensor is at least 2D, with a batch dimension + at the beginning and a channel dimension as specified by `data_format`. The + layer trains an independent probability density model for each channel, but + assumes that across all other dimensions, the inputs are i.i.d. (independent + and identically distributed). Because the entropy (and hence, average + codelength) is a function of the densities, this assumption may have a direct + effect on the compression performance. + + Because data compression always involves discretization, the outputs of the + layer are generally only approximations of its inputs. During training, + discretization is modeled using additive uniform noise to ensure + differentiability. The entropies computed during training are differential + entropies. During evaluation, the data is actually quantized, and the + entropies are discrete (Shannon entropies). To make sure the approximated + tensor values are good enough for practical purposes, the training phase must + be used to balance the quality of the approximation with the entropy, by + adding an entropy term to the training loss, as in the following example. + + Here, we use the entropy bottleneck to compress the latent representation of + an autoencoder. The data vectors `x` in this case are 4D tensors in + `'channels_last'` format (for example, 16x16 pixel grayscale images). + + The layer always produces exactly one auxiliary loss and one update op which + are only significant for compression and decompression. To use the compression + feature, the auxiliary loss must be minimized during or after training. After + that, the update op must be executed at least once. Here, we simply attach + them to the main training step. + + Training: + ``` + # Build autoencoder. + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + entropy_bottleneck = EntropyBottleneck() + y_, likelihoods = entropy_bottleneck(y, training=True) + x_ = backward_transform(y_) + + # Information content (= predicted codelength) in bits of each batch element + # (note that taking the natural logarithm and dividing by `log(2)` is + # equivalent to taking base-2 logarithms): + bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) + + # Squared difference of each batch element: + squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) + + # The loss is a weighted sum of mean squared error and entropy (average + # information content), where the weight controls the trade-off between + # approximation error and entropy. + main_loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) + + # Minimize loss and auxiliary loss, and execute update op. + main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4) + main_step = optimizer.minimize(main_loss) + # 1e-2 is a good starting point for the learning rate of the auxiliary loss, + # assuming Adam is used. + aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-2) + aux_step = optimizer.minimize(entropy_bottleneck.losses[0]) + step = tf.group(main_step, aux_step, entropy_bottleneck.updates[0]) + ``` + + Evaluation: + ``` + # Build autoencoder. + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + y_, likelihoods = EntropyBottleneck()(y, training=False) + x_ = backward_transform(y_) + + # Information content (= predicted codelength) in bits of each batch element: + bits = tf.reduce_sum(tf.log(likelihoods), axis=(1, 2, 3)) / -np.log(2) + + # Squared difference of each batch element: + squared_error = tf.reduce_sum(tf.squared_difference(x, x_), axis=(1, 2, 3)) + + # The loss is a weighted sum of mean squared error and entropy (average + # information content), where the weight controls the trade-off between + # approximation error and entropy. + loss = 0.5 * tf.reduce_mean(squared_error) + tf.reduce_mean(bits) + ``` + + To be able to compress the bottleneck tensor and decompress it in a different + session, or on a different machine, you need three items: + - The compressed representations stored as strings. + - The shape of the bottleneck for these string representations as a `Tensor`, + as well as the number of channels of the bottleneck at graph construction + time. + - The checkpoint of the trained model that was used for compression. Note: + It is crucial that the auxiliary loss produced by this layer is minimized + during or after training, and that the update op is run after training and + minimization of the auxiliary loss, but *before* the checkpoint is saved. + + Compression: + ``` + x = tf.placeholder(tf.float32, shape=[None, 16, 16, 1]) + y = forward_transform(x) + strings = EntropyBottleneck().compress(y) + shape = tf.shape(y)[1:] + ``` + + Decompression: + ``` + strings = tf.placeholder(tf.string, shape=[None]) + shape = tf.placeholder(tf.int32, shape=[3]) + entropy_bottleneck = EntropyBottleneck(dtype=tf.float32) + y_ = entropy_bottleneck.decompress(strings, shape, channels=5) + x_ = backward_transform(y_) + ``` + Here, we assumed that the tensor produced by the forward transform has 5 + channels. + + The above four use cases can also be implemented within the same session (i.e. + on the same `EntropyBottleneck` instance), for testing purposes, etc., by + calling the object more than once. + + Arguments: + init_scale: Float. A scaling factor determining the initial width of the + probability densities. This should be chosen big enough so that the + range of values of the layer inputs roughly falls within the interval + [`-init_scale`, `init_scale`] at the beginning of training. + filters: An iterable of ints, giving the number of filters at each layer of + the density model. Generally, the more filters and layers, the more + expressive is the density model in terms of modeling more complicated + distributions of the layer inputs. For details, refer to the paper + referenced above. The default is `[3, 3, 3]`, which should be sufficient + for most practical purposes. + tail_mass: Float, between 0 and 1. The bottleneck layer automatically + determines the range of input values that should be represented based on + their frequency of occurrence. Values occurring in the tails of the + distributions will be clipped to that range during compression. + `tail_mass` determines the amount of probability mass in the tails which + is cut off in the worst case. For example, the default value of `1e-9` + means that at most 1 in a billion input samples will be clipped to the + range. + optimize_integer_offset: Boolean. Typically, the input values of this layer + are floats, which means that quantization during evaluation can be + performed with an arbitrary offset. By default, the layer determines that + offset automatically. In special situations, such as when it is known that + the layer will receive only full integer values during evaluation, it can + be desirable to set this argument to `False` instead, in order to always + quantize to full integer values. + likelihood_bound: Float. If positive, the returned likelihood values are + ensured to be greater than or equal to this value. This prevents very + large gradients with a typical entropy loss (defaults to 1e-9). + range_coder_precision: Integer, between 1 and 16. The precision of the range + coder used for compression and decompression. This trades off computation + speed with compression efficiency, where 16 is the slowest but most + efficient setting. Choosing lower values may increase the average + codelength slightly compared to the estimated entropies. + data_format: Either `'channels_first'` or `'channels_last'` (default). + trainable: Boolean. Whether the layer should be trained. + name: String. The name of the layer. + dtype: Default dtype of the layer's parameters (default of `None` means use + the type of the first input). + + Read-only properties: + init_scale: See above. + filters: See above. + tail_mass: See above. + optimize_integer_offset: See above. + likelihood_bound: See above. + range_coder_precision: See above. + data_format: See above. + name: String. See above. + dtype: See above. + trainable_variables: List of trainable variables. + non_trainable_variables: List of non-trainable variables. + variables: List of all variables of this layer, trainable and non-trainable. + updates: List of update ops of this layer. Always contains exactly one + update op, which must be run once after the last training step, before + `compress` or `decompress` is used. + losses: List of losses added by this layer. Always contains exactly one + auxiliary loss, which must be added to the training loss. + + Mutable properties: + trainable: Boolean. Whether the layer should be trained. + input_spec: Optional `InputSpec` object specifying the constraints on inputs + that can be accepted by the layer. + """ + + def __init__(self, init_scale=10, filters=(3, 3, 3), tail_mass=1e-9, + optimize_integer_offset=True, likelihood_bound=1e-9, + range_coder_precision=16, data_format="channels_last", **kwargs): + super(EntropyBottleneck, self).__init__(**kwargs) + self._init_scale = float(init_scale) + self._filters = tuple(int(f) for f in filters) + self._tail_mass = float(tail_mass) + if not 0 < self.tail_mass < 1: + raise ValueError( + "`tail_mass` must be between 0 and 1, got {}.".format(self.tail_mass)) + self._optimize_integer_offset = bool(optimize_integer_offset) + self._likelihood_bound = float(likelihood_bound) + self._range_coder_precision = int(range_coder_precision) + self._data_format = data_format + self._channel_axis(2) # trigger ValueError early + self.input_spec = engine.InputSpec(min_ndim=2) + + @property + def init_scale(self): + return self._init_scale + + @property + def filters(self): + return self._filters + + @property + def tail_mass(self): + return self._tail_mass + + @property + def optimize_integer_offset(self): + return self._optimize_integer_offset + + @property + def likelihood_bound(self): + return self._likelihood_bound + + @property + def range_coder_precision(self): + return self._range_coder_precision + + @property + def data_format(self): + return self._data_format + + def _channel_axis(self, ndim): + try: + return {"channels_first": 1, "channels_last": ndim - 1}[self.data_format] + except KeyError: + raise ValueError("Unsupported `data_format` for {} layer: {}.".format( + self.__class__.__name__, self.data_format)) + + def _logits_cumulative(self, inputs, stop_gradient): + """Evaluate logits of the cumulative densities. + + Args: + inputs: The values at which to evaluate the cumulative densities, expected + to be a `Tensor` of shape `(channels, 1, batch)`. + stop_gradient: Boolean. Whether to add `array_ops.stop_gradient` calls so + that the gradient of the output with respect to the density model + parameters is disconnected (the gradient with respect to `inputs` is + left untouched). + + Returns: + A `Tensor` of the same shape as `inputs`, containing the logits of the + cumulative densities evaluated at the given inputs. + """ + logits = inputs + + for i in range(len(self.filters) + 1): + matrix = self._matrices[i] + if stop_gradient: + matrix = array_ops.stop_gradient(matrix) + logits = math_ops.matmul(matrix, logits) + + bias = self._biases[i] + if stop_gradient: + bias = array_ops.stop_gradient(bias) + logits += bias + + if i < len(self._factors): + factor = self._factors[i] + if stop_gradient: + factor = array_ops.stop_gradient(factor) + logits += factor * math_ops.tanh(logits) + + return logits + + def build(self, input_shape): + """Builds the layer. + + Creates the variables for the network modeling the densities, creates the + auxiliary loss estimating the median and tail quantiles of the densities, + and then uses that to create the probability mass functions and the update + op that produces the discrete cumulative density functions used by the range + coder. + + Args: + input_shape: Shape of the input tensor, used to get the number of + channels. + + Raises: + ValueError: if `input_shape` doesn't specify the length of the channel + dimension. + """ + input_shape = tensor_shape.TensorShape(input_shape) + channel_axis = self._channel_axis(input_shape.ndims) + channels = input_shape[channel_axis].value + if channels is None: + raise ValueError("The channel dimension of the inputs must be defined.") + self.input_spec = engine.InputSpec( + ndim=input_shape.ndims, axes={channel_axis: channels}) + filters = (1,) + self.filters + (1,) + scale = self.init_scale ** (1 / (len(self.filters) + 1)) + + # Create variables. + self._matrices = [] + self._biases = [] + self._factors = [] + for i in range(len(self.filters) + 1): + init = np.log(np.expm1(1 / scale / filters[i + 1])) + matrix = self.add_variable( + "matrix_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], filters[i]), + initializer=init_ops.Constant(init)) + matrix = nn.softplus(matrix) + self._matrices.append(matrix) + + bias = self.add_variable( + "bias_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], 1), + initializer=init_ops.RandomUniform(-.5, .5)) + self._biases.append(bias) + + if i < len(self.filters): + factor = self.add_variable( + "factor_{}".format(i), dtype=self.dtype, + shape=(channels, filters[i + 1], 1), + initializer=init_ops.Zeros()) + factor = math_ops.tanh(factor) + self._factors.append(factor) + + # To figure out what range of the densities to sample, we need to compute + # the quantiles given by `tail_mass / 2` and `1 - tail_mass / 2`. Since we + # can't take inverses of the cumulative directly, we make it an optimization + # problem: + # `quantiles = argmin(|logit(cumulative) - target|)` + # where `target` is `logit(tail_mass / 2)` or `logit(1 - tail_mass / 2)`. + # Taking the logit (inverse of sigmoid) of the cumulative makes the + # representation of the right target more numerically stable. + + # Numerically stable way of computing logits of `tail_mass / 2` + # and `1 - tail_mass / 2`. + target = np.log(2 / self.tail_mass - 1) + # Compute lower and upper tail quantile as well as median. + target = constant_op.constant([-target, 0, target], dtype=self.dtype) + + def quantiles_initializer(shape, dtype=None, partition_info=None): + del partition_info # unused + assert tuple(shape[1:]) == (1, 3) + init = constant_op.constant( + [[[-self.init_scale, 0, self.init_scale]]], dtype=dtype) + return array_ops.tile(init, (shape[0], 1, 1)) + + quantiles = self.add_variable( + "quantiles", shape=(channels, 1, 3), dtype=self.dtype, + initializer=quantiles_initializer) + logits = self._logits_cumulative(quantiles, stop_gradient=True) + loss = math_ops.reduce_sum(abs(logits - target)) + self.add_loss(loss, inputs=None) + + # Save medians for `call`, `compress`, and `decompress`. + self._medians = quantiles[:, :, 1:2] + if not self.optimize_integer_offset: + self._medians = math_ops.round(self._medians) + + # Largest distance observed between lower tail quantile and median, + # or between median and upper tail quantile. + minima = math_ops.reduce_max(self._medians - quantiles[:, :, 0:1]) + maxima = math_ops.reduce_max(quantiles[:, :, 2:3] - self._medians) + minmax = math_ops.maximum(minima, maxima) + minmax = math_ops.ceil(minmax) + minmax = math_ops.maximum(minmax, 1) + + # Sample the density up to `minmax` around the median. + samples = math_ops.range(-minmax, minmax + 1, dtype=self.dtype) + samples += self._medians + + half = constant_op.constant(.5, dtype=self.dtype) + # We strip the sigmoid from the end here, so we can use the special rule + # below to only compute differences in the left tail of the sigmoid. + # This increases numerical stability (see explanation in `call`). + lower = self._logits_cumulative(samples - half, stop_gradient=True) + upper = self._logits_cumulative(samples + half, stop_gradient=True) + # Flip signs if we can move more towards the left tail of the sigmoid. + sign = -math_ops.sign(math_ops.add_n([lower, upper])) + pmf = abs(math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) + # Add tail masses to first and last bin of pmf, as we clip values for + # compression, meaning that out-of-range values get mapped to these bins. + pmf = array_ops.concat([ + math_ops.add_n([pmf[:, 0, :1], math_ops.sigmoid(lower[:, 0, :1])]), + pmf[:, 0, 1:-1], + math_ops.add_n([pmf[:, 0, -1:], math_ops.sigmoid(-upper[:, 0, -1:])]), + ], axis=-1) + self._pmf = pmf + + cdf = coder_ops.pmf_to_quantized_cdf( + pmf, precision=self.range_coder_precision) + def cdf_getter(*args, **kwargs): + del args, kwargs # ignored + return variable_scope.get_variable( + "quantized_cdf", dtype=dtypes.int32, initializer=cdf, + trainable=False, validate_shape=False, collections=()) + # Need to provide a fake shape here since add_variable insists on it. + self._quantized_cdf = self.add_variable( + "quantized_cdf", shape=(channels, 1), dtype=dtypes.int32, + getter=cdf_getter, trainable=False) + + update_op = state_ops.assign( + self._quantized_cdf, cdf, validate_shape=False) + self.add_update(update_op, inputs=None) + + super(EntropyBottleneck, self).build(input_shape) + + def call(self, inputs, training): + """Pass a tensor through the bottleneck. + + Args: + inputs: The tensor to be passed through the bottleneck. + training: Boolean. If `True`, returns a differentiable approximation of + the inputs, and their likelihoods under the modeled probability + densities. If `False`, returns the quantized inputs and their + likelihoods under the corresponding probability mass function. These + quantities can't be used for training, as they are not differentiable, + but represent actual compression more closely. + + Returns: + values: `Tensor` with the same shape as `inputs` containing the perturbed + or quantized input values. + likelihood: `Tensor` with the same shape as `inputs` containing the + likelihood of `values` under the modeled probability distributions. + + Raises: + ValueError: if `inputs` has different `dtype` or number of channels than + a previous set of inputs the model was invoked with earlier. + """ + inputs = ops.convert_to_tensor(inputs) + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + half = constant_op.constant(.5, dtype=self.dtype) + + # Convert to (channels, 1, batch) format by commuting channels to front + # and then collapsing. + order = list(range(ndim)) + order.pop(channel_axis) + order.insert(0, channel_axis) + values = array_ops.transpose(inputs, order) + shape = array_ops.shape(values) + values = array_ops.reshape(values, (shape[0], 1, -1)) + + # Add noise or quantize. + if training: + noise = random_ops.random_uniform(array_ops.shape(values), -half, half) + values = math_ops.add_n([values, noise]) + elif self.optimize_integer_offset: + values = math_ops.round(values - self._medians) + self._medians + else: + values = math_ops.round(values) + + # Evaluate densities. + # We can use the special rule below to only compute differences in the left + # tail of the sigmoid. This increases numerical stability: sigmoid(x) is 1 + # for large x, 0 for small x. Subtracting two numbers close to 0 can be done + # with much higher precision than subtracting two numbers close to 1. + lower = self._logits_cumulative(values - half, stop_gradient=False) + upper = self._logits_cumulative(values + half, stop_gradient=False) + # Flip signs if we can move more towards the left tail of the sigmoid. + sign = -math_ops.sign(math_ops.add_n([lower, upper])) + sign = array_ops.stop_gradient(sign) + likelihood = abs( + math_ops.sigmoid(sign * upper) - math_ops.sigmoid(sign * lower)) + if self.likelihood_bound > 0: + likelihood_bound = constant_op.constant( + self.likelihood_bound, dtype=self.dtype) + # TODO(jballe): Override gradients. + likelihood = math_ops.maximum(likelihood, likelihood_bound) + + # Convert back to input tensor shape. + order = list(range(1, ndim)) + order.insert(channel_axis, 0) + values = array_ops.reshape(values, shape) + values = array_ops.transpose(values, order) + likelihood = array_ops.reshape(likelihood, shape) + likelihood = array_ops.transpose(likelihood, order) + + if not context.executing_eagerly(): + values_shape, likelihood_shape = self.compute_output_shape(inputs.shape) + values.set_shape(values_shape) + likelihood.set_shape(likelihood_shape) + + return values, likelihood + + def compress(self, inputs): + """Compress inputs and store their binary representations into strings. + + Args: + inputs: `Tensor` with values to be compressed. + + Returns: + String `Tensor` vector containing the compressed representation of each + batch element of `inputs`. + """ + with ops.name_scope(self._name_scope()): + inputs = ops.convert_to_tensor(inputs) + if not self.built: + # Check input assumptions set before layer building, e.g. input rank. + self._assert_input_compatibility(inputs) + if self.dtype is None: + self._dtype = inputs.dtype.base_dtype.name + self.build(inputs.shape) + + # Check input assumptions set after layer building, e.g. input shape. + if not context.executing_eagerly(): + self._assert_input_compatibility(inputs) + + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + # Tuple of slices for expanding dimensions of tensors below. + slices = ndim * [None] + [slice(None)] + slices[channel_axis] = slice(None) + slices = tuple(slices) + + # Expand dimensions of CDF to input dimensions, keeping the channels along + # the right dimension. + cdf = self._quantized_cdf[slices[1:]] + num_levels = array_ops.shape(cdf)[-1] - 1 + + # Bring inputs to the right range by centering the range on the medians. + half = constant_op.constant(.5, dtype=self.dtype) + medians = array_ops.squeeze(self._medians, [1, 2]) + offsets = (math_ops.cast(num_levels // 2, self.dtype) + half) - medians + # Expand offsets to input dimensions and add to inputs. + values = inputs + offsets[slices[:-1]] + + # Clip to range and cast to integers. Because we have added .5 above, and + # all values are positive, the cast effectively implements rounding. + values = math_ops.maximum(values, half) + values = math_ops.minimum( + values, math_ops.cast(num_levels, self.dtype) - half) + values = math_ops.cast(values, dtypes.int16) + + def loop_body(tensor): + return coder_ops.range_encode( + tensor, cdf, precision=self.range_coder_precision) + strings = functional_ops.map_fn( + loop_body, values, dtype=dtypes.string, back_prop=False) + + if not context.executing_eagerly(): + strings.set_shape(inputs.shape[:1]) + + return strings + + def decompress(self, strings, shape, channels=None): + """Decompress values from their compressed string representations. + + Args: + strings: A string `Tensor` vector containing the compressed data. + shape: A `Tensor` vector of int32 type. Contains the shape of the tensor + to be decompressed, excluding the batch dimension. + channels: Integer. Specifies the number of channels statically. Needs only + be set if the layer hasn't been built yet (i.e., this is the first input + it receives). + + Returns: + The decompressed `Tensor`. Its shape will be equal to `shape` prepended + with the batch dimension from `strings`. + + Raises: + ValueError: If the length of `shape` isn't available at graph construction + time. + """ + with ops.name_scope(self._name_scope()): + strings = ops.convert_to_tensor(strings) + shape = ops.convert_to_tensor(shape) + if self.built: + ndim = self.input_spec.ndim + channel_axis = self._channel_axis(ndim) + if channels is None: + channels = self.input_spec.axes[channel_axis] + else: + if not (shape.shape.is_fully_defined() and shape.shape.ndims == 1): + raise ValueError("`shape` must be a vector with known length.") + ndim = shape.shape[0].value + 1 + channel_axis = self._channel_axis(ndim) + input_shape = ndim * [None] + input_shape[channel_axis] = channels + self.build(input_shape) + + # Tuple of slices for expanding dimensions of tensors below. + slices = ndim * [None] + [slice(None)] + slices[channel_axis] = slice(None) + slices = tuple(slices) + + # Expand dimensions of CDF to input dimensions, keeping the channels along + # the right dimension. + cdf = self._quantized_cdf[slices[1:]] + num_levels = array_ops.shape(cdf)[-1] - 1 + + def loop_body(string): + return coder_ops.range_decode( + string, shape, cdf, precision=self.range_coder_precision) + outputs = functional_ops.map_fn( + loop_body, strings, dtype=dtypes.int16, back_prop=False) + outputs = math_ops.cast(outputs, self.dtype) + + medians = array_ops.squeeze(self._medians, [1, 2]) + offsets = math_ops.cast(num_levels // 2, self.dtype) - medians + outputs -= offsets[slices[:-1]] + + if not context.executing_eagerly(): + outputs_shape = ndim * [None] + outputs_shape[0] = strings.shape[0] + outputs_shape[channel_axis] = channels + outputs.set_shape(outputs_shape) + + return outputs + + def visualize(self): + """Multi-channel visualization of densities as images. + + Creates and returns an image summary visualizing the current probabilty + density estimates. The image contains one row for each channel. Within each + row, the pixel intensities are proportional to probability values, and each + row is centered on the median of the corresponding distribution. + + Returns: + The created image summary. + """ + with ops.name_scope(self._name_scope()): + image = self._pmf + image *= 255 / math_ops.reduce_max(image, axis=1, keepdims=True) + image = math_ops.cast(image + .5, dtypes.uint8) + image = image[None, :, :, None] + return summary.image("pmf", image, max_outputs=1) + + def compute_output_shape(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape) + return input_shape, input_shape diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py new file mode 100644 index 0000000000000000000000000000000000000000..798b0234ebcce7df108a0da65d1305502ce0253a --- /dev/null +++ b/tensorflow/contrib/coder/python/layers/entropybottleneck_test.py @@ -0,0 +1,315 @@ +# -*- coding: utf-8 -*- +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests of EntropyBottleneck class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.coder.python.layers import entropybottleneck + +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent + + +class EntropyBottleneckTest(test.TestCase): + + def test_noise(self): + # Tests that the noise added is uniform noise between -0.5 and 0.5. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck() + noisy, _ = layer(inputs, training=True) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + values = np.linspace(-50, 50, 100)[:, None] + noisy, = sess.run([noisy], {inputs: values}) + self.assertFalse(np.allclose(values, noisy, rtol=0, atol=.49)) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + + def test_quantization(self): + # Tests that inputs are quantized to full integer values, even after + # quantiles have been updated. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=False) + quantized, _ = layer(inputs, training=False) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + values = np.linspace(-50, 50, 100)[:, None] + quantized, = sess.run([quantized], {inputs: values}) + self.assertAllClose(np.around(values), quantized, rtol=0, atol=1e-6) + + def test_quantization_optimized_offset(self): + # Tests that inputs are not quantized to full integer values after quantiles + # have been updated. However, the difference between input and output should + # be between -0.5 and 0.5, and the offset must be consistent. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(optimize_integer_offset=True) + quantized, _ = layer(inputs, training=False) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + values = np.linspace(-50, 50, 100)[:, None] + quantized, = sess.run([quantized], {inputs: values}) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + diff = np.ravel(np.around(values) - quantized) % 1 + self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) + self.assertNotEqual(diff[0], 0) + + def test_codec(self): + # Tests that inputs are compressed and decompressed correctly, and quantized + # to full integer values, even after quantiles have been updated. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=60, + optimize_integer_offset=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + self.assertAllClose(np.around(values), decoded, rtol=0, atol=1e-6) + + def test_codec_optimized_offset(self): + # Tests that inputs are compressed and decompressed correctly, and not + # quantized to full integer values after quantiles have been updated. + # However, the difference between input and output should be between -0.5 + # and 0.5, and the offset must be consistent. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=60, + optimize_integer_offset=True) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + opt = gradient_descent.GradientDescentOptimizer(learning_rate=1) + self.assertTrue(len(layer.losses) == 1) + step = opt.minimize(layer.losses[0]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run(step) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + diff = np.ravel(np.around(values) - decoded) % 1 + self.assertAllClose(diff, np.full_like(diff, diff[0]), rtol=0, atol=5e-6) + self.assertNotEqual(diff[0], 0) + + def test_codec_clipping(self): + # Tests that inputs are compressed and decompressed correctly, and clipped + # to the expected range. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=40) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = np.linspace(-50, 50, 100)[None, :, None] + decoded, = sess.run([decoded], {inputs: values}) + expected = np.clip(np.around(values), -40, 40) + self.assertAllClose(expected, decoded, rtol=0, atol=1e-6) + + def test_channels_last(self): + # Test the layer with more than one channel and multiple input dimensions, + # with the channels in the last dimension. + inputs = array_ops.placeholder(dtypes.float32, (None, None, None, 2)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_last", init_scale=50) + noisy, _ = layer(inputs, training=True) + quantized, _ = layer(inputs, training=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.normal(size=(7, 5, 3, 2)) + noisy, quantized, decoded = sess.run( + [noisy, quantized, decoded], {inputs: values}) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + + def test_channels_first(self): + # Test the layer with more than one channel and multiple input dimensions, + # with the channel dimension right after the batch dimension. + inputs = array_ops.placeholder(dtypes.float32, (None, 3, None, None)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", init_scale=50) + noisy, _ = layer(inputs, training=True) + quantized, _ = layer(inputs, training=False) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.normal(size=(2, 3, 5, 7)) + noisy, quantized, decoded = sess.run( + [noisy, quantized, decoded], {inputs: values}) + self.assertAllClose(values, noisy, rtol=0, atol=.5) + self.assertAllClose(values, quantized, rtol=0, atol=.5) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + + def test_compress(self): + # Test compression and decompression, and produce test data for + # `test_decompress`. If you set the constant at the end to `True`, this test + # will fail and the log will contain the new test data. + inputs = array_ops.placeholder(dtypes.float32, (2, 3, 10)) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", filters=(), init_scale=2) + bitstrings = layer.compress(inputs) + decoded = layer.decompress(bitstrings, array_ops.shape(inputs)[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + values = 5 * np.random.uniform(size=(2, 3, 10)) - 2.5 + bitstrings, quantized_cdf, decoded = sess.run( + [bitstrings, layer._quantized_cdf, decoded], {inputs: values}) + self.assertAllClose(values, decoded, rtol=0, atol=.5) + # Set this constant to `True` to log new test data for `test_decompress`. + if False: # pylint:disable=using-constant-test + assert False, (bitstrings, quantized_cdf, decoded) + + # Data generated by `test_compress`. + # pylint:disable=g-inconsistent-quotes,bad-whitespace + bitstrings = np.array([ + b'\x1e\xbag}\xc2\xdaN\x8b\xbd.', + b'\x8dF\xf0%\x1cv\xccllW' + ], dtype=object) + + quantized_cdf = np.array([ + [ 0, 15636, 22324, 30145, 38278, 65536], + [ 0, 19482, 26927, 35052, 42904, 65535], + [ 0, 21093, 28769, 36919, 44578, 65536] + ], dtype=np.int32) + + expected = np.array([ + [[-2., 1., 0., -2., -1., -2., -2., -2., 2., -1.], + [ 1., 2., 1., 0., -2., -2., 1., 2., 0., 1.], + [ 2., 0., -2., 2., 0., -1., -2., 0., 2., 0.]], + [[ 1., 2., 0., -1., 1., 2., 1., 1., 2., -2.], + [ 2., -1., -1., 0., -1., 2., 0., 2., -2., 2.], + [ 2., -2., -2., -1., -2., 1., -2., 0., 0., 0.]] + ], dtype=np.float32) + # pylint:enable=g-inconsistent-quotes,bad-whitespace + + def test_decompress(self): + # Test that decompression of values compressed with a previous version + # works, i.e. that the file format doesn't change across revisions. + bitstrings = array_ops.placeholder(dtypes.string) + input_shape = array_ops.placeholder(dtypes.int32) + quantized_cdf = array_ops.placeholder(dtypes.int32) + layer = entropybottleneck.EntropyBottleneck( + data_format="channels_first", filters=(), dtype=dtypes.float32) + layer.build(self.expected.shape) + layer._quantized_cdf = quantized_cdf + decoded = layer.decompress(bitstrings, input_shape[1:]) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + decoded, = sess.run([decoded], { + bitstrings: self.bitstrings, input_shape: self.expected.shape, + quantized_cdf: self.quantized_cdf}) + self.assertAllClose(self.expected, decoded, rtol=0, atol=1e-6) + + def test_build_decompress(self): + # Test that layer can be built when `decompress` is the first call to it. + bitstrings = array_ops.placeholder(dtypes.string) + input_shape = array_ops.placeholder(dtypes.int32, shape=[3]) + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.decompress(bitstrings, input_shape[1:], channels=5) + self.assertTrue(layer.built) + + def test_pmf_normalization(self): + # Test that probability mass functions are normalized correctly. + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.build((None, 10)) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + pmf, = sess.run([layer._pmf]) + self.assertAllClose(np.ones(10), np.sum(pmf, axis=-1), rtol=0, atol=1e-6) + + def test_visualize(self): + # Test that summary op can be constructed. + layer = entropybottleneck.EntropyBottleneck(dtype=dtypes.float32) + layer.build((None, 10)) + summary = layer.visualize() + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + sess.run([summary]) + + def test_normalization(self): + # Test that densities are normalized correctly. + inputs = array_ops.placeholder(dtypes.float32, (None, 1)) + layer = entropybottleneck.EntropyBottleneck(filters=(2,)) + _, likelihood = layer(inputs, training=True) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + x = np.repeat(np.arange(-200, 201), 1000)[:, None] + likelihood, = sess.run([likelihood], {inputs: x}) + self.assertEqual(x.shape, likelihood.shape) + integral = np.sum(likelihood) * .001 + self.assertAllClose(1, integral, rtol=0, atol=1e-4) + + def test_entropy_estimates(self): + # Test that entropy estimates match actual range coding. + inputs = array_ops.placeholder(dtypes.float32, (1, None, 1)) + layer = entropybottleneck.EntropyBottleneck( + filters=(2, 3), data_format="channels_last") + _, likelihood = layer(inputs, training=True) + diff_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) + _, likelihood = layer(inputs, training=False) + disc_entropy = math_ops.reduce_sum(math_ops.log(likelihood)) / -np.log(2) + bitstrings = layer.compress(inputs) + with self.test_session() as sess: + sess.run(variables.global_variables_initializer()) + self.assertTrue(len(layer.updates) == 1) + sess.run(layer.updates[0]) + diff_entropy, disc_entropy, bitstrings = sess.run( + [diff_entropy, disc_entropy, bitstrings], + {inputs: np.random.normal(size=(1, 10000, 1))}) + codelength = 8 * sum(len(bitstring) for bitstring in bitstrings) + self.assertAllClose(diff_entropy, disc_entropy, rtol=5e-3, atol=0) + self.assertAllClose(disc_entropy, codelength, rtol=5e-3, atol=0) + self.assertGreater(codelength, disc_entropy) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/constrained_optimization/BUILD b/tensorflow/contrib/constrained_optimization/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..619153df67c90cea5a5082a411972948bac5fe90 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/BUILD @@ -0,0 +1,91 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("//tensorflow:tensorflow.bzl", "py_test") + +# Transitive dependencies of this target will be included in the pip package. +py_library( + name = "constrained_optimization_pip", + deps = [ + ":constrained_optimization", + ":test_util", + ], +) + +py_library( + name = "constrained_optimization", + srcs = [ + "__init__.py", + "python/candidates.py", + "python/constrained_minimization_problem.py", + "python/constrained_optimizer.py", + "python/external_regret_optimizer.py", + "python/swap_regret_optimizer.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework", + "//tensorflow/python:standard_ops", + "//tensorflow/python:state_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + "@six_archive//:six", + ], +) + +py_test( + name = "candidates_test", + srcs = ["python/candidates_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + "//tensorflow/python:client_testlib", + "//third_party/py/numpy", + ], +) + +# NOTE: This library can't be "testonly" since it needs to be included in the +# pip package. +py_library( + name = "test_util", + srcs = ["python/test_util.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + "//tensorflow/python:dtypes", + "//tensorflow/python:standard_ops", + ], +) + +py_test( + name = "external_regret_optimizer_test", + srcs = ["python/external_regret_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + ":test_util", + "//tensorflow/python:client_testlib", + "//tensorflow/python:standard_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + ], +) + +py_test( + name = "swap_regret_optimizer_test", + srcs = ["python/swap_regret_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":constrained_optimization", + ":test_util", + "//tensorflow/python:client_testlib", + "//tensorflow/python:standard_ops", + "//tensorflow/python:training", + "//third_party/py/numpy", + ], +) diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c65a150464efc1e77419040f66f36fc6756325aa --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/README.md @@ -0,0 +1,345 @@ + + +# ConstrainedOptimization (TFCO) + +TFCO is a library for optimizing inequality-constrained problems in TensorFlow. +Both the objective function and the constraints are represented as Tensors, +giving users the maximum amount of flexibility in specifying their optimization +problems. + +This flexibility makes optimization considerably more difficult: on a non-convex +problem, if one uses the "standard" approach of introducing a Lagrange +multiplier for each constraint, and then jointly maximizing over the Lagrange +multipliers and minimizing over the model parameters, then a stable stationary +point might not even *exist*. Hence, in some cases, oscillation, instead of +convergence, is inevitable. + +Thankfully, it turns out that even if, over the course of optimization, no +*particular* iterate does a good job of minimizing the objective while +satisfying the constraints, the *sequence* of iterates, on average, usually +will. This observation suggests the following approach: at training time, we'll +periodically snapshot the model state during optimization; then, at evaluation +time, each time we're given a new example to evaluate, we'll sample one of the +saved snapshots uniformly at random, and apply it to the example. This +*stochastic model* will generally perform well, both with respect to the +objective function, and the constraints. + +In fact, we can do better: it's possible to post-process the set of snapshots to +find a distribution over at most $$m+1$$ snapshots, where $$m$$ is the number of +constraints, that will be at least as good (and will usually be much better) +than the (much larger) uniform distribution described above. If you're unable or +unwilling to use a stochastic model at all, then you can instead use a heuristic +to choose the single best snapshot. + +For full details, motivation, and theoretical results on the approach taken by +this library, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +which will be referred to as [CoJiSr18] throughout the remainder of this +document. + +### Proxy Constraints + +Imagine that we want to constrain the recall of a binary classifier to be at +least 90%. Since the recall is proportional to the number of true positive +classifications, which itself is a sum of indicator functions, this constraint +is non-differentible, and therefore cannot be used in a problem that will be +optimized using a (stochastic) gradient-based algorithm. + +For this and similar problems, TFCO supports so-called *proxy constraints*, +which are (at least semi-differentiable) approximations of the original +constraints. For example, one could create a proxy recall function by replacing +the indicator functions with sigmoids. During optimization, each proxy +constraint function will be penalized, with the magnitude of the penalty being +chosen to satisfy the corresponding *original* (non-proxy) constraint. + +On a problem including proxy constraints—even a convex problem—the +Lagrangian approach discussed above isn't guaranteed to work. However, a +different algorithm, based on minimizing *swap regret*, does work. Aside from +this difference, the recommended procedure for optimizing a proxy-constrained +problem remains the same: periodically snapshot the model during optimization, +and then either find the best $$m+1$$-sized distribution, or heuristically +choose the single best snapshot. + +## Components + +* [constrained_minimization_problem](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py): + contains the `ConstrainedMinimizationProblem` interface. Your own + constrained optimization problems should be represented using + implementations of this interface. + +* [constrained_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py): + contains the `ConstrainedOptimizer` interface, which is similar to (but + different from) `tf.train.Optimizer`, with the main difference being that + `ConstrainedOptimizer`s are given `ConstrainedMinimizationProblem`s to + optimize, and perform constrained optimization. + + * [external_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py): + contains the `AdditiveExternalRegretOptimizer` implementation, which is + a `ConstrainedOptimizer` implementing the Lagrangian approach discussed + above (with additive updates to the Lagrange multipliers). You should + use this optimizer for problems *without* proxy constraints. It may also + work for problems with proxy constraints, but we recommend using a swap + regret optimizer, instead. + + This optimizer is most similar to Algorithm 3 in Appendix C.3 of + [CoJiSr18], and is discussed in Section 3. The two differences are that + it uses proxy constraints (if they're provided) in the update of the + model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for + the "inner" updates. + + * [swap_regret_optimizer](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py): + contains the `AdditiveSwapRegretOptimizer` and + `MultiplicativeSwapRegretOptimizer` implementations, which are + `ConstrainedOptimizer`s implementing the swap-regret minimization + approach mentioned above (with additive or multiplicative updates, + respectively, to the parameters associated with the + constraints—these parameters are not Lagrange multipliers, but + play a similar role). You should use one of these optimizers (we suggest + `MultiplicativeSwapRegretOptimizer`) for problems *with* proxy + constraints. + + The `MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 + in Section 4 of [CoJiSr18], with the difference being that it uses + `tf.train.Optimizer`s, instead of SGD, for the "inner" updates. The + `AdditiveSwapRegretOptimizer` differs further in that it performs + additive (instead of multiplicative) updates of the stochastic matrix. + +* [candidates](https://www.tensorflow.org/code/tensorflow/contrib/constrained_optimization/python/candidates.py): + contains two functions, `find_best_candidate_distribution` and + `find_best_candidate_index`. Both of these functions are given a set of + candidate solutions to a constrained optimization problem, from which the + former finds the best distribution over at most $$m+1$$ candidates, and the + latter heuristically finds the single best candidate. As discussed above, + the set of candidates will typically be model snapshots saved periodically + during optimization. Both of these functions require that scipy be + installed. + + The `find_best_candidate_distribution` function implements the approach + described in Lemma 3 of [CoJiSr18], while `find_best_candidate_index` + implements the heuristic used for hyperparameter search in the experiments + of Section 5.2. + +## Convex Example with Proxy Constraints + +This is a simple example of recall-constrained optimization on simulated data: +we will try to find a classifier that minimizes the average hinge loss while +constraining recall to be at least 90%. + +We'll start with the required imports—notice the definition of `tfco`: + +```python +import math +import numpy as np +import tensorflow as tf + +tfco = tf.contrib.constrained_optimization +``` + +We'll now create an implementation of the `ConstrainedMinimizationProblem` class +for this problem. The constructor takes three parameters: a Tensor containing +the classification labels (0 or 1) for every training example, another Tensor +containing the model's predictions on every training example (sometimes called +the "logits"), and the lower bound on recall that will be enforced using a +constraint. + +This implementation will contain both constraints *and* proxy constraints: the +former represents the constraint that the true recall (defined in terms of the +*number* of true positives) be at least `recall_lower_bound`, while the latter +represents the same constraint, but on a hinge approximation of the recall. + +```python +class ExampleProblem(tfco.ConstrainedMinimizationProblem): + + def __init__(self, labels, predictions, recall_lower_bound): + self._labels = labels + self._predictions = predictions + self._recall_lower_bound = recall_lower_bound + # The number of positively-labeled examples. + self._positive_count = tf.reduce_sum(self._labels) + + @property + def objective(self): + return tf.losses.hinge_loss(labels=self._labels, logits=self._predictions) + + @property + def constraints(self): + true_positives = self._labels * tf.to_float(self._predictions > 0) + true_positive_count = tf.reduce_sum(true_positives) + recall = true_positive_count / self._positive_count + # The constraint is (recall >= self._recall_lower_bound), which we convert + # to (self._recall_lower_bound - recall <= 0) because + # ConstrainedMinimizationProblems must always provide their constraints in + # the form (tensor <= 0). + # + # The result of this function should be a tensor, with each element being + # a quantity that is constrained to be nonpositive. We only have one + # constraint, so we return a one-element tensor. + return self._recall_lower_bound - recall + + @property + def proxy_constraints(self): + # Use 1 - hinge since we're SUBTRACTING recall in the constraint function, + # and we want the proxy constraint function to be convex. + true_positives = self._labels * tf.minimum(1.0, self._predictions) + true_positive_count = tf.reduce_sum(true_positives) + recall = true_positive_count / self._positive_count + # Please see the corresponding comment in the constraints property. + return self._recall_lower_bound - recall +``` + +We'll now create a simple simulated dataset by sampling 1000 random +10-dimensional feature vectors from a Gaussian, finding their labels using a +random "ground truth" linear model, and then adding noise by randomly flipping +200 labels. + +```python +# Create a simulated 10-dimensional training dataset consisting of 1000 labeled +# examples, of which 800 are labeled correctly and 200 are mislabeled. +num_examples = 1000 +num_mislabeled_examples = 200 +dimension = 10 +# We will constrain the recall to be at least 90%. +recall_lower_bound = 0.9 + +# Create random "ground truth" parameters to a linear model. +ground_truth_weights = np.random.normal(size=dimension) / math.sqrt(dimension) +ground_truth_threshold = 0 + +# Generate a random set of features for each example. +features = np.random.normal(size=(num_examples, dimension)).astype( + np.float32) / math.sqrt(dimension) +# Compute the labels from these features given the ground truth linear model. +labels = (np.matmul(features, ground_truth_weights) > + ground_truth_threshold).astype(np.float32) +# Add noise by randomly flipping num_mislabeled_examples labels. +mislabeled_indices = np.random.choice( + num_examples, num_mislabeled_examples, replace=False) +labels[mislabeled_indices] = 1 - labels[mislabeled_indices] +``` + +We're now ready to construct our model, and the corresponding optimization +problem. We'll use a linear model of the form $$f(x) = w^T x - t$$, where $$w$$ +is the `weights`, and $$t$$ is the `threshold`. The `problem` variable will hold +an instance of the `ExampleProblem` class we created earlier. + +```python +# Create variables containing the model parameters. +weights = tf.Variable(tf.zeros(dimension), dtype=tf.float32, name="weights") +threshold = tf.Variable(0.0, dtype=tf.float32, name="threshold") + +# Create the optimization problem. +constant_labels = tf.constant(labels, dtype=tf.float32) +constant_features = tf.constant(features, dtype=tf.float32) +predictions = tf.tensordot(constant_features, weights, axes=(1, 0)) - threshold +problem = ExampleProblem( + labels=constant_labels, + predictions=predictions, + recall_lower_bound=recall_lower_bound, +) +``` + +We're almost ready to train our model, but first we'll create a couple of +functions to measure its performance. We're interested in two quantities: the +average hinge loss (which we seek to minimize), and the recall (which we +constrain). + +```python +def average_hinge_loss(labels, predictions): + num_examples, = np.shape(labels) + signed_labels = (labels * 2) - 1 + total_hinge_loss = np.sum(np.maximum(0.0, 1.0 - signed_labels * predictions)) + return total_hinge_loss / num_examples + +def recall(labels, predictions): + positive_count = np.sum(labels) + true_positives = labels * (predictions > 0) + true_positive_count = np.sum(true_positives) + return true_positive_count / positive_count +``` + +As was mentioned earlier, external regret optimizers suffice for problems +without proxy constraints, but swap regret optimizers are recommended for +problems *with* proxy constraints. Since this problem contains proxy +constraints, we use the `MultiplicativeSwapRegretOptimizer`. + +For this problem, the constraint is fairly easy to satisfy, so we can use the +same "inner" optimizer (an `AdagradOptimizer` with a learning rate of 1) for +optimization of both the model parameters (`weights` and `threshold`), and the +internal parameters associated with the constraints (these are the analogues of +the Lagrange multipliers used by the `MultiplicativeSwapRegretOptimizer`). For +more difficult problems, it will often be necessary to use different optimizers, +with different learning rates (presumably found via a hyperparameter search): to +accomplish this, pass *both* the `optimizer` and `constraint_optimizer` +parameters to `MultiplicativeSwapRegretOptimizer`'s constructor. + +Since this is a convex problem (both the objective and proxy constraint +functions are convex), we can just take the last iterate. Periodic snapshotting, +and the use of the `find_best_candidate_distribution` or +`find_best_candidate_index` functions, is generally only necessary for +non-convex problems (and even then, it isn't *always* necessary). + +```python +with tf.Session() as session: + optimizer = tfco.MultiplicativeSwapRegretOptimizer( + optimizer=tf.train.AdagradOptimizer(learning_rate=1.0)) + train_op = optimizer.minimize(problem) + + session.run(tf.global_variables_initializer()) + for ii in xrange(1000): + session.run(train_op) + + trained_weights, trained_threshold = session.run((weights, threshold)) + +trained_predictions = np.matmul(features, trained_weights) - trained_threshold +print("Constrained average hinge loss = %f" % average_hinge_loss( + labels, trained_predictions)) +print("Constrained recall = %f" % recall(labels, trained_predictions)) +``` + +Running the above code gives the following output (due to the randomness of the +dataset, you'll get a different result when you run it): + +```none +Constrained average hinge loss = 0.710019 +Constrained recall = 0.899811 +``` + +As we hoped, the recall is extremely close to 90%—and, thanks to the use +of proxy constraints, this is the *true* recall, not a hinge approximation. + +For comparison, let's try optimizing the same problem *without* the recall +constraint: + +```python +with tf.Session() as session: + optimizer = tf.train.AdagradOptimizer(learning_rate=1.0) + # For optimizing the unconstrained problem, we just minimize the "objective" + # portion of the minimization problem. + train_op = optimizer.minimize(problem.objective) + + session.run(tf.global_variables_initializer()) + for ii in xrange(1000): + session.run(train_op) + + trained_weights, trained_threshold = session.run((weights, threshold)) + +trained_predictions = np.matmul(features, trained_weights) - trained_threshold +print("Unconstrained average hinge loss = %f" % average_hinge_loss( + labels, trained_predictions)) +print("Unconstrained recall = %f" % recall(labels, trained_predictions)) +``` + +This code gives the following output (again, you'll get a different answer, +since the dataset is random): + +```none +Unconstrained average hinge loss = 0.627271 +Unconstrained recall = 0.793951 +``` + +Because there is no constraint, the unconstrained problem does a better job of +minimizing the average hinge loss, but naturally doesn't approach 90% recall. diff --git a/tensorflow/contrib/constrained_optimization/__init__.py b/tensorflow/contrib/constrained_optimization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e49ba9f179ea98aaa9c35f79787605b53a1ec53 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""A library for performing constrained optimization in TensorFlow.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=wildcard-import +from tensorflow.contrib.constrained_optimization.python.candidates import * +from tensorflow.contrib.constrained_optimization.python.constrained_minimization_problem import * +from tensorflow.contrib.constrained_optimization.python.constrained_optimizer import * +from tensorflow.contrib.constrained_optimization.python.external_regret_optimizer import * +from tensorflow.contrib.constrained_optimization.python.swap_regret_optimizer import * +# pylint: enable=wildcard-import + +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + "AdditiveExternalRegretOptimizer", + "AdditiveSwapRegretOptimizer", + "ConstrainedMinimizationProblem", + "ConstrainedOptimizer", + "find_best_candidate_distribution", + "find_best_candidate_index", + "MultiplicativeSwapRegretOptimizer", +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/constrained_optimization/python/candidates.py b/tensorflow/contrib/constrained_optimization/python/candidates.py new file mode 100644 index 0000000000000000000000000000000000000000..ac86a6741be1f244476f917d0e151166db65524b --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/candidates.py @@ -0,0 +1,319 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Code for optimizing over a set of candidate solutions. + +The functions in this file deal with the constrained problem: + +> minimize f(w) +> s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + +Here, f(w) is the "objective function", and g_i(w) is the ith (of m) "constraint +function". Given the values of the objective and constraint functions for a set +of n "candidate solutions" {w_0,w_1,...,w_{n-1}} (for a total of n objective +function values, and n*m constraint function values), the +`find_best_candidate_distribution` function finds the best DISTRIBUTION over +these candidates, while `find_best_candidate_index' heuristically finds the +single best candidate. + +Both of these functions have dependencies on `scipy`, so if you want to call +them, then you must make sure that `scipy` is available. The imports are +performed inside the functions themselves, so if they're not actually called, +then `scipy` is not needed. + +For more specifics, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +The `find_best_candidate_distribution` function implements the approach +described in Lemma 3, while `find_best_candidate_index` implements the heuristic +used for hyperparameter search in the experiments of Section 5.2. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + + +def _find_best_candidate_distribution_helper(objective_vector, + constraints_matrix, + maximum_violation=0.0): + """Finds a distribution minimizing an objective subject to constraints. + + This function deals with the constrained problem: + + > minimize f(w) + > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + + Here, f(w) is the "objective function", and g_i(w) is the ith (of m) + "constraint function". Given a set of n "candidate solutions" + {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n + candidates that, in expectation, minimizes the objective while violating + the constraints by no more than `maximum_violation`. If no such distribution + exists, it returns an error (using Go-style error reporting). + + The `objective_vector` parameter should be a numpy array with shape (n,), for + which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a + numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j). + + This function will return a distribution for which at most m+1 probabilities, + and often fewer, are nonzero. + + Args: + objective_vector: numpy array of shape (n,), where n is the number of + "candidate solutions". Contains the objective function values. + constraints_matrix: numpy array of shape (m,n), where m is the number of + constraints and n is the number of "candidate solutions". Contains the + constraint violation magnitudes. + maximum_violation: nonnegative float, the maximum amount by which any + constraint may be violated, in expectation. + + Returns: + A pair (`result`, `message`), exactly one of which is None. If `message` is + None, then the `result` contains the optimal distribution as a numpy array + of shape (n,). If `result` is None, then `message` contains an error + message. + + Raises: + ValueError: If `objective_vector` and `constraints_matrix` have inconsistent + shapes, or if `maximum_violation` is negative. + ImportError: If we're unable to import `scipy.optimize`. + """ + if maximum_violation < 0.0: + raise ValueError("maximum_violation must be nonnegative") + + mm, nn = np.shape(constraints_matrix) + if (nn,) != np.shape(objective_vector): + raise ValueError( + "objective_vector must have shape (n,), and constraints_matrix (m, n)," + " where n is the number of candidates, and m is the number of " + "constraints") + + # We import scipy inline, instead of at the top of the file, so that a scipy + # dependency is only introduced if either find_best_candidate_distribution() + # or find_best_candidate_index() are actually called. + import scipy.optimize # pylint: disable=g-import-not-at-top + + # Feasibility (within maximum_violation) constraints. + a_ub = constraints_matrix + b_ub = np.full((mm, 1), maximum_violation) + # Sum-to-one constraint. + a_eq = np.ones((1, nn)) + b_eq = np.ones((1, 1)) + # Nonnegativity constraints. + bounds = (0, None) + + result = scipy.optimize.linprog( + objective_vector, + A_ub=a_ub, + b_ub=b_ub, + A_eq=a_eq, + b_eq=b_eq, + bounds=bounds) + # Go-style error reporting. We don't raise on error, since + # find_best_candidate_distribution() needs to handle the failure case, and we + # shouldn't use exceptions as flow-control. + if not result.success: + return (None, result.message) + else: + return (result.x, None) + + +def find_best_candidate_distribution(objective_vector, + constraints_matrix, + epsilon=0.0): + """Finds a distribution minimizing an objective subject to constraints. + + This function deals with the constrained problem: + + > minimize f(w) + > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + + Here, f(w) is the "objective function", and g_i(w) is the ith (of m) + "constraint function". Given a set of n "candidate solutions" + {w_0,w_1,...,w_{n-1}}, this function finds a distribution over these n + candidates that, in expectation, minimizes the objective while violating + the constraints by the smallest possible amount (with the amount being found + via bisection search). + + The `objective_vector` parameter should be a numpy array with shape (n,), for + which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a + numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j). + + This function will return a distribution for which at most m+1 probabilities, + and often fewer, are nonzero. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + This function implements the approach described in Lemma 3. + + Args: + objective_vector: numpy array of shape (n,), where n is the number of + "candidate solutions". Contains the objective function values. + constraints_matrix: numpy array of shape (m,n), where m is the number of + constraints and n is the number of "candidate solutions". Contains the + constraint violation magnitudes. + epsilon: nonnegative float, the threshold at which to terminate the binary + search while searching for the minimal expected constraint violation + magnitude. + + Returns: + The optimal distribution, as a numpy array of shape (n,). + + Raises: + ValueError: If `objective_vector` and `constraints_matrix` have inconsistent + shapes, or if `epsilon` is negative. + ImportError: If we're unable to import `scipy.optimize`. + """ + if epsilon < 0.0: + raise ValueError("epsilon must be nonnegative") + + # If there is a feasible solution (i.e. with maximum_violation=0), then that's + # what we'll return. + pp, _ = _find_best_candidate_distribution_helper(objective_vector, + constraints_matrix) + if pp is not None: + return pp + + # The bound is the minimum over all candidates, of the maximum per-candidate + # constraint violation. + lower = 0.0 + upper = np.min(np.amax(constraints_matrix, axis=0)) + best_pp, _ = _find_best_candidate_distribution_helper( + objective_vector, constraints_matrix, maximum_violation=upper) + assert best_pp is not None + + # Throughout this loop, a maximum_violation of "lower" is not achievable, + # but a maximum_violation of "upper" is achiveable. + while True: + middle = 0.5 * (lower + upper) + if (middle - lower <= epsilon) or (upper - middle <= epsilon): + break + else: + pp, _ = _find_best_candidate_distribution_helper( + objective_vector, constraints_matrix, maximum_violation=middle) + if pp is None: + lower = middle + else: + best_pp = pp + upper = middle + + return best_pp + + +def find_best_candidate_index(objective_vector, + constraints_matrix, + rank_objectives=False): + """Heuristically finds the best candidate solution to a constrained problem. + + This function deals with the constrained problem: + + > minimize f(w) + > s.t. g_i(w) <= 0 for all i in {0,1,...,m-1} + + Here, f(w) is the "objective function", and g_i(w) is the ith (of m) + "constraint function". Given a set of n "candidate solutions" + {w_0,w_1,...,w_{n-1}}, this function finds the "best" solution according + to the following heuristic: + + 1. Across all models, the ith constraint violations (i.e. max{0, g_i(0)}) + are ranked, as are the objectives (if rank_objectives=True). + 2. Each model is then associated its MAXIMUM rank across all m constraints + (and the objective, if rank_objectives=True). + 3. The model with the minimal maximum rank is then identified. Ties are + broken using the objective function value. + 4. The index of this "best" model is returned. + + The `objective_vector` parameter should be a numpy array with shape (n,), for + which objective_vector[i] = f(w_i). Likewise, `constraints_matrix` should be a + numpy array with shape (m,n), for which constraints_matrix[i,j] = g_i(w_j). + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + This function implements the heuristic used for hyperparameter search in the + experiments of Section 5.2. + + Args: + objective_vector: numpy array of shape (n,), where n is the number of + "candidate solutions". Contains the objective function values. + constraints_matrix: numpy array of shape (m,n), where m is the number of + constraints and n is the number of "candidate solutions". Contains the + constraint violation magnitudes. + rank_objectives: bool, whether the objective function values should be + included in the initial ranking step. If True, both the objective and + constraints will be ranked. If False, only the constraints will be ranked. + In either case, the objective function values will be used for + tiebreaking. + + Returns: + The index (in {0,1,...,n-1}) of the "best" model according to the above + heuristic. + + Raises: + ValueError: If `objective_vector` and `constraints_matrix` have inconsistent + shapes. + ImportError: If we're unable to import `scipy.stats`. + """ + mm, nn = np.shape(constraints_matrix) + if (nn,) != np.shape(objective_vector): + raise ValueError( + "objective_vector must have shape (n,), and constraints_matrix (m, n)," + " where n is the number of candidates, and m is the number of " + "constraints") + + # We import scipy inline, instead of at the top of the file, so that a scipy + # dependency is only introduced if either find_best_candidate_distribution() + # or find_best_candidate_index() are actually called. + import scipy.stats # pylint: disable=g-import-not-at-top + + if rank_objectives: + maximum_ranks = scipy.stats.rankdata(objective_vector, method="min") + else: + maximum_ranks = np.zeros(nn, dtype=np.int64) + for ii in xrange(mm): + # Take the maximum of the constraint functions with zero, since we want to + # rank the magnitude of constraint *violations*. If the constraint is + # satisfied, then we don't care how much it's satisfied by (as a result, we + # we expect all models satisfying a constraint to be tied at rank 1). + ranks = scipy.stats.rankdata( + np.maximum(0.0, constraints_matrix[ii, :]), method="min") + maximum_ranks = np.maximum(maximum_ranks, ranks) + + best_index = None + best_rank = float("Inf") + best_objective = float("Inf") + for ii in xrange(nn): + if maximum_ranks[ii] < best_rank: + best_index = ii + best_rank = maximum_ranks[ii] + best_objective = objective_vector[ii] + elif (maximum_ranks[ii] == best_rank) and (objective_vector[ii] <= + best_objective): + best_index = ii + best_objective = objective_vector[ii] + + return best_index diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a4c49d48bc5c763489215261a909573af0f19055 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py @@ -0,0 +1,95 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for constrained_optimization.python.candidates.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.constrained_optimization.python import candidates +from tensorflow.python.platform import test + + +class CandidatesTest(test.TestCase): + + def test_inconsistent_shapes_for_best_distribution(self): + """An error is raised when parameters have inconsistent shapes.""" + objective_vector = np.array([1, 2, 3]) + constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + with self.assertRaises(ValueError): + _ = candidates.find_best_candidate_distribution(objective_vector, + constraints_matrix) + + def test_inconsistent_shapes_for_best_index(self): + """An error is raised when parameters have inconsistent shapes.""" + objective_vector = np.array([1, 2, 3]) + constraints_matrix = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + with self.assertRaises(ValueError): + _ = candidates.find_best_candidate_index(objective_vector, + constraints_matrix) + + def test_best_distribution(self): + """Distribution should match known solution.""" + objective_vector = np.array( + [0.03053309, -0.06667082, 0.88355145, 0.46529806]) + constraints_matrix = np.array( + [[-0.60164551, 0.36676229, 0.7856454, -0.8441711], + [0.00371592, -0.16392108, -0.59778071, -0.56908492]]) + distribution = candidates.find_best_candidate_distribution( + objective_vector, constraints_matrix) + # Verify that the solution is a probability distribution. + self.assertTrue(np.all(distribution >= 0)) + self.assertAlmostEqual(np.sum(distribution), 1.0) + # Verify that the solution satisfies the constraints. + maximum_constraint_violation = np.amax( + np.dot(constraints_matrix, distribution)) + self.assertLessEqual(maximum_constraint_violation, 0) + # Verify that the solution matches that which we expect. + expected_distribution = np.array([0.37872711, 0.62127289, 0, 0]) + self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6) + + def test_best_index_rank_objectives_true(self): + """Index should match known solution.""" + # Objective ranks = [2, 1, 4, 3]. + objective_vector = np.array( + [0.03053309, -0.06667082, 0.88355145, 0.46529806]) + # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]]. + constraints_matrix = np.array( + [[-0.60164551, 0.36676229, 0.7856454, -0.8441711], + [0.00371592, -0.16392108, -0.59778071, -0.56908492]]) + # Maximum ranks = [4, 3, 4, 3]. + index = candidates.find_best_candidate_index( + objective_vector, constraints_matrix, rank_objectives=True) + self.assertEqual(1, index) + + def test_best_index_rank_objectives_false(self): + """Index should match known solution.""" + # Objective ranks = [2, 1, 4, 3]. + objective_vector = np.array( + [0.03053309, -0.06667082, 0.88355145, 0.46529806]) + # Constraint ranks = [[1, 3, 4, 1], [4, 1, 1, 1]]. + constraints_matrix = np.array( + [[-0.60164551, 0.36676229, 0.7856454, -0.8441711], + [0.00371592, -0.16392108, -0.59778071, -0.56908492]]) + # Maximum ranks = [4, 3, 4, 1]. + index = candidates.find_best_candidate_index( + objective_vector, constraints_matrix, rank_objectives=False) + self.assertEqual(3, index) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py new file mode 100644 index 0000000000000000000000000000000000000000..70813fb217956b167b80a7e1d555c8ba79088fdb --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py @@ -0,0 +1,123 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines abstract class for `ConstrainedMinimizationProblem`s. + +A ConstrainedMinimizationProblem consists of an objective function to minimize, +and a set of constraint functions that are constrained to be nonpositive. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +import six + + +@six.add_metaclass(abc.ABCMeta) +class ConstrainedMinimizationProblem(object): + """Abstract class representing a `ConstrainedMinimizationProblem`. + + A ConstrainedMinimizationProblem consists of an objective function to + minimize, and a set of constraint functions that are constrained to be + nonpositive. + + In addition to the constraint functions, there may (optionally) be proxy + constraint functions: a ConstrainedOptimizer will attempt to penalize these + proxy constraint functions so as to satisfy the (non-proxy) constraints. Proxy + constraints could be used if the constraints functions are difficult or + impossible to optimize (e.g. if they're piecewise constant), in which case the + proxy constraints should be some approximation of the original constraints + that is well-enough behaved to permit successful optimization. + """ + + @abc.abstractproperty + def objective(self): + """Returns the objective function. + + Returns: + A 0d tensor that should be minimized. + """ + pass + + @property + def num_constraints(self): + """Returns the number of constraints. + + Returns: + An int containing the number of constraints. + + Raises: + ValueError: If the constraints (or proxy_constraints, if present) do not + have fully-known shapes, OR if proxy_constraints are present, and the + shapes of constraints and proxy_constraints are fully-known, but they're + different. + """ + constraints_shape = self.constraints.get_shape() + if self.proxy_constraints is None: + proxy_constraints_shape = constraints_shape + else: + proxy_constraints_shape = self.proxy_constraints.get_shape() + + if (constraints_shape is None or proxy_constraints_shape is None or + any([ii is None for ii in constraints_shape.as_list()]) or + any([ii is None for ii in proxy_constraints_shape.as_list()])): + raise ValueError( + "constraints and proxy_constraints must have fully-known shapes") + if constraints_shape != proxy_constraints_shape: + raise ValueError( + "constraints and proxy_constraints must have the same shape") + + size = 1 + for ii in constraints_shape.as_list(): + size *= ii + return int(size) + + @abc.abstractproperty + def constraints(self): + """Returns the vector of constraint functions. + + Letting g_i be the ith element of the constraints vector, the ith constraint + will be g_i <= 0. + + Returns: + A tensor of constraint functions. + """ + pass + + # This is a property, instead of an abstract property, since it doesn't need + # to be overridden: if proxy_constraints returns None, then there are no + # proxy constraints. + @property + def proxy_constraints(self): + """Returns the optional vector of proxy constraint functions. + + The difference between `constraints` and `proxy_constraints` is that, when + proxy constraints are present, the `constraints` are merely EVALUATED during + optimization, whereas the `proxy_constraints` are DIFFERENTIATED. If there + are no proxy constraints, then the `constraints` are both evaluated and + differentiated. + + For example, if we want to impose constraints on step functions, then we + could use these functions for `constraints`. However, because a step + function has zero gradient almost everywhere, we can't differentiate these + functions, so we would take `proxy_constraints` to be some differentiable + approximation of `constraints`. + + Returns: + A tensor of proxy constraint functions. + """ + return None diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..805554536610a5e2cc650ff0b47185f4fbd6fac5 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/constrained_optimizer.py @@ -0,0 +1,208 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines base class for `ConstrainedOptimizer`s.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +import six + +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.training import optimizer as train_optimizer + + +@six.add_metaclass(abc.ABCMeta) +class ConstrainedOptimizer(object): + """Base class representing a constrained optimizer. + + A ConstrainedOptimizer wraps a tf.train.Optimizer (or more than one), and + applies it to a ConstrainedMinimizationProblem. Unlike a tf.train.Optimizer, + which takes a tensor to minimize as a parameter to its minimize() method, a + constrained optimizer instead takes a ConstrainedMinimizationProblem. + """ + + def __init__(self, optimizer): + """Constructs a new `ConstrainedOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the + ConstraintedMinimizationProblem. + + Returns: + A new `ConstrainedOptimizer`. + """ + self._optimizer = optimizer + + @property + def optimizer(self): + """Returns the `tf.train.Optimizer` used for optimization.""" + return self._optimizer + + def minimize_unconstrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the unconstrained problem. + + Unlike `minimize_constrained`, this function ignores the `constraints` (and + `proxy_constraints`) portion of the minimization problem entirely, and only + minimizes `objective`. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + return self.optimizer.minimize( + minimization_problem.objective, + global_step=global_step, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + name=name, + grad_loss=grad_loss) + + @abc.abstractmethod + def minimize_constrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + Unlike `minimize_unconstrained`, this function attempts to find a solution + that minimizes the `objective` portion of the minimization problem while + satisfying the `constraints` portion. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + pass + + def minimize(self, + minimization_problem, + unconstrained_steps=None, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + This method combines the functionality of `minimize_unconstrained` and + `minimize_constrained`. If global_step < unconstrained_steps, it will + perform an unconstrained update, and if global_step >= unconstrained_steps, + it will perform a constrained update. + + The reason for this functionality is that it may be best to initialize the + constrained optimizer with an approximate optimum of the unconstrained + problem. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + unconstrained_steps: int, number of steps for which we should perform + unconstrained updates, before transitioning to constrained updates. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + + Raises: + ValueError: If unconstrained_steps is provided, but global_step is not. + """ + + def unconstrained_fn(): + """Returns an `Op` for minimizing the unconstrained problem.""" + return self.minimize_unconstrained( + minimization_problem=minimization_problem, + global_step=global_step, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + name=name, + grad_loss=grad_loss) + + def constrained_fn(): + """Returns an `Op` for minimizing the constrained problem.""" + return self.minimize_constrained( + minimization_problem=minimization_problem, + global_step=global_step, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + name=name, + grad_loss=grad_loss) + + if unconstrained_steps is not None: + if global_step is None: + raise ValueError( + "global_step cannot be None if unconstrained_steps is provided") + unconstrained_steps_tensor = ops.convert_to_tensor(unconstrained_steps) + dtype = unconstrained_steps_tensor.dtype + return control_flow_ops.cond( + standard_ops.cast(global_step, dtype) < unconstrained_steps_tensor, + true_fn=unconstrained_fn, + false_fn=constrained_fn) + else: + return constrained_fn() diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..01c6e4f08afb93e37aa124f31ca7faa10b07d4d6 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py @@ -0,0 +1,375 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines `AdditiveExternalRegretOptimizer`. + +This optimizer minimizes a `ConstrainedMinimizationProblem` by introducing +Lagrange multipliers, and using `tf.train.Optimizer`s to jointly optimize over +the model parameters and Lagrange multipliers. + +For the purposes of constrained optimization, at least in theory, +external-regret minimization suffices if the `ConstrainedMinimizationProblem` +we're optimizing doesn't have any `proxy_constraints`, while swap-regret +minimization should be used if `proxy_constraints` are present. + +For more specifics, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +The formulation used by the AdditiveExternalRegretOptimizer--which is simply the +usual Lagrangian formulation--can be found in Definition 1, and is discussed in +Section 3. This optimizer is most similar to Algorithm 3 in Appendix C.3, with +the two differences being that it uses proxy constraints (if they're provided) +in the update of the model parameters, and uses `tf.train.Optimizer`s, instead +of SGD, for the "inner" updates. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc + +import six + +from tensorflow.contrib.constrained_optimization.python import constrained_optimizer + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import optimizer as train_optimizer + + +def _project_multipliers_wrt_euclidean_norm(multipliers, radius): + """Projects its argument onto the feasible region. + + The feasible region is the set of all vectors with nonnegative elements that + sum to at most `radius`. + + Args: + multipliers: 1d tensor, the Lagrange multipliers to project. + radius: float, the radius of the feasible region. + + Returns: + The 1d tensor that results from projecting `multipliers` onto the feasible + region w.r.t. the Euclidean norm. + + Raises: + ValueError: if the `multipliers` tensor does not have a fully-known shape, + or is not one-dimensional. + """ + multipliers_shape = multipliers.get_shape() + if multipliers_shape is None: + raise ValueError("multipliers must have known shape") + if multipliers_shape.ndims != 1: + raise ValueError( + "multipliers must be one dimensional (instead is %d-dimensional)" % + multipliers_shape.ndims) + dimension = multipliers_shape[0].value + if dimension is None: + raise ValueError("multipliers must have fully-known shape") + + def while_loop_condition(iteration, multipliers, inactive, old_inactive): + """Returns false if the while loop should terminate.""" + del multipliers # Needed by the body, but not the condition. + not_done = (iteration < dimension) + not_converged = standard_ops.reduce_any( + standard_ops.not_equal(inactive, old_inactive)) + return standard_ops.logical_and(not_done, not_converged) + + def while_loop_body(iteration, multipliers, inactive, old_inactive): + """Performs one iteration of the projection.""" + del old_inactive # Needed by the condition, but not the body. + iteration += 1 + scale = standard_ops.minimum( + 0.0, + (radius - standard_ops.reduce_sum(multipliers)) / standard_ops.maximum( + 1.0, standard_ops.reduce_sum(inactive))) + multipliers += scale * inactive + new_inactive = standard_ops.to_float(multipliers > 0) + multipliers *= new_inactive + return (iteration, multipliers, new_inactive, inactive) + + iteration = standard_ops.constant(0) + inactive = standard_ops.ones_like(multipliers) + + # We actually want a do-while loop, so we explicitly call while_loop_body() + # once before tf.while_loop(). + iteration, multipliers, inactive, old_inactive = while_loop_body( + iteration, multipliers, inactive, inactive) + iteration, multipliers, inactive, old_inactive = control_flow_ops.while_loop( + while_loop_condition, + while_loop_body, + loop_vars=(iteration, multipliers, inactive, old_inactive), + name="euclidean_projection") + + return multipliers + + +@six.add_metaclass(abc.ABCMeta) +class _ExternalRegretOptimizer(constrained_optimizer.ConstrainedOptimizer): + """Base class representing an `_ExternalRegretOptimizer`. + + This class contains most of the logic for performing constrained + optimization, minimizing external regret for the constraints player. What it + *doesn't* do is keep track of the internal state (the Lagrange multipliers). + Instead, the state is accessed via the _initial_state(), + _lagrange_multipliers(), _constraint_grad_and_var() and _projection_op() + methods. + + The reason for this is that we want to make it easy to implement different + representations of the internal state. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by `_ExternalRegretOptimizer`s--which is simply the usual + Lagrangian formulation--can be found in Definition 1, and is discussed in + Section 3. Such optimizers are most similar to Algorithm 3 in Appendix C.3. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Constructs a new `_ExternalRegretOptimizer`. + + The difference between `optimizer` and `constraint_optimizer` (if the latter + is provided) is that the former is used for learning the model parameters, + while the latter us used for the Lagrange multipliers. If no + `constraint_optimizer` is provided, then `optimizer` is used for both. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of the ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multipliers. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multipliers. + + Returns: + A new `_ExternalRegretOptimizer`. + """ + super(_ExternalRegretOptimizer, self).__init__(optimizer=optimizer) + self._constraint_optimizer = constraint_optimizer + + @property + def constraint_optimizer(self): + """Returns the `tf.train.Optimizer` used for the Lagrange multipliers.""" + return self._constraint_optimizer + + @abc.abstractmethod + def _initial_state(self, num_constraints): + pass + + @abc.abstractmethod + def _lagrange_multipliers(self, state): + pass + + @abc.abstractmethod + def _constraint_grad_and_var(self, state, gradient): + pass + + @abc.abstractmethod + def _projection_op(self, state, name=None): + pass + + def minimize_constrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + The `optimizer` constructor parameter will be used to update the model + parameters, while the Lagrange multipliers will be updated using + `constrained_optimizer` (if provided) or `optimizer` (if not). + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + objective = minimization_problem.objective + + constraints = minimization_problem.constraints + proxy_constraints = minimization_problem.proxy_constraints + if proxy_constraints is None: + proxy_constraints = constraints + # Flatten both constraints tensors to 1d. + num_constraints = minimization_problem.num_constraints + constraints = standard_ops.reshape(constraints, shape=(num_constraints,)) + proxy_constraints = standard_ops.reshape( + proxy_constraints, shape=(num_constraints,)) + + # We use a lambda to initialize the state so that, if this function call is + # inside the scope of a tf.control_dependencies() block, the dependencies + # will not be applied to the initializer. + state = standard_ops.Variable( + lambda: self._initial_state(num_constraints), + trainable=False, + name="external_regret_optimizer_state") + + multipliers = self._lagrange_multipliers(state) + loss = ( + objective + standard_ops.tensordot(multipliers, proxy_constraints, 1)) + multipliers_gradient = constraints + + update_ops = [] + if self.constraint_optimizer is None: + # If we don't have a separate constraint_optimizer, then we use + # self._optimizer for both the update of the model parameters, and that of + # the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + grads_and_vars.append( + self._constraint_grad_and_var(state, multipliers_gradient)) + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + else: + # If we have a separate constraint_optimizer, then we use self._optimizer + # for the update of the model parameters, and self._constraint_optimizer + # for that of the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + multiplier_grads_and_vars = [ + self._constraint_grad_and_var(state, multipliers_gradient) + ] + + gradients = [ + gradient for gradient, _ in grads_and_vars + multiplier_grads_and_vars + if gradient is not None + ] + with ops.control_dependencies(gradients): + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + update_ops.append( + self.constraint_optimizer.apply_gradients( + multiplier_grads_and_vars, name="optimizer_state_update")) + + with ops.control_dependencies(update_ops): + if global_step is None: + # If we don't have a global step, just project, and we're done. + return self._projection_op(state, name=name) + else: + # If we have a global step, then we need to increment it in addition to + # projecting. + projection_op = self._projection_op(state, name="project") + with ops.colocate_with(global_step): + global_step_op = state_ops.assign_add( + global_step, 1, name="global_step_increment") + return control_flow_ops.group(projection_op, global_step_op, name=name) + + +class AdditiveExternalRegretOptimizer(_ExternalRegretOptimizer): + """A `ConstrainedOptimizer` based on external-regret minimization. + + This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly + minimize over the model parameters, and maximize over Lagrange multipliers, + with the latter maximization using additive updates and an algorithm that + minimizes external regret. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by this optimizer--which is simply the usual Lagrangian + formulation--can be found in Definition 1, and is discussed in Section 3. It + is most similar to Algorithm 3 in Appendix C.3, with the two differences being + that it uses proxy constraints (if they're provided) in the update of the + model parameters, and uses `tf.train.Optimizer`s, instead of SGD, for the + "inner" updates. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + maximum_multiplier_radius=None): + """Constructs a new `AdditiveExternalRegretOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multipliers. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multipliers. + maximum_multiplier_radius: float, an optional upper bound to impose on the + sum of the Lagrange multipliers. + + Returns: + A new `AdditiveExternalRegretOptimizer`. + + Raises: + ValueError: If the maximum_multiplier_radius parameter is nonpositive. + """ + super(AdditiveExternalRegretOptimizer, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + + if maximum_multiplier_radius and (maximum_multiplier_radius <= 0.0): + raise ValueError("maximum_multiplier_radius must be strictly positive") + + self._maximum_multiplier_radius = maximum_multiplier_radius + + def _initial_state(self, num_constraints): + # For an AdditiveExternalRegretOptimizer, the internal state is simply a + # tensor of Lagrange multipliers with shape (m,), where m is the number of + # constraints. + return standard_ops.zeros((num_constraints,), dtype=dtypes.float32) + + def _lagrange_multipliers(self, state): + return state + + def _constraint_grad_and_var(self, state, gradient): + # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True? + return (-gradient, state) + + def _projection_op(self, state, name=None): + with ops.colocate_with(state): + if self._maximum_multiplier_radius: + projected_multipliers = _project_multipliers_wrt_euclidean_norm( + state, self._maximum_multiplier_radius) + else: + projected_multipliers = standard_ops.maximum(state, 0.0) + return state_ops.assign(state, projected_multipliers, name=name) diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4bf6271009161c4c449cd9c3cdab9fba90aa59 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer_test.py @@ -0,0 +1,136 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for constrained_optimization.python.external_regret_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.constrained_optimization.python import external_regret_optimizer +from tensorflow.contrib.constrained_optimization.python import test_util + +from tensorflow.python.ops import standard_ops +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent + + +class AdditiveExternalRegretOptimizerWrapper( + external_regret_optimizer.AdditiveExternalRegretOptimizer): + """Testing wrapper class around AdditiveExternalRegretOptimizer. + + This class is identical to AdditiveExternalRegretOptimizer, except that it + caches the internal optimization state when _lagrange_multipliers() is called, + so that we can test that the Lagrange multipliers take on their expected + values. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + maximum_multiplier_radius=None): + """Same as AdditiveExternalRegretOptimizer.__init__.""" + super(AdditiveExternalRegretOptimizerWrapper, self).__init__( + optimizer=optimizer, + constraint_optimizer=constraint_optimizer, + maximum_multiplier_radius=maximum_multiplier_radius) + self._cached_lagrange_multipliers = None + + @property + def lagrange_multipliers(self): + """Returns the cached Lagrange multipliers.""" + return self._cached_lagrange_multipliers + + def _lagrange_multipliers(self, state): + """Caches the internal state for testing.""" + self._cached_lagrange_multipliers = super( + AdditiveExternalRegretOptimizerWrapper, + self)._lagrange_multipliers(state) + return self._cached_lagrange_multipliers + + +class ExternalRegretOptimizerTest(test.TestCase): + + def test_project_multipliers_wrt_euclidean_norm(self): + """Tests Euclidean projection routine on some known values.""" + multipliers1 = standard_ops.constant([-0.1, -0.6, -0.3]) + expected_projected_multipliers1 = np.array([0.0, 0.0, 0.0]) + + multipliers2 = standard_ops.constant([-0.1, 0.6, 0.3]) + expected_projected_multipliers2 = np.array([0.0, 0.6, 0.3]) + + multipliers3 = standard_ops.constant([0.4, 0.7, -0.2, 0.5, 0.1]) + expected_projected_multipliers3 = np.array([0.2, 0.5, 0.0, 0.3, 0.0]) + + with self.test_session() as session: + projected_multipliers1 = session.run( + external_regret_optimizer._project_multipliers_wrt_euclidean_norm( + multipliers1, 1.0)) + projected_multipliers2 = session.run( + external_regret_optimizer._project_multipliers_wrt_euclidean_norm( + multipliers2, 1.0)) + projected_multipliers3 = session.run( + external_regret_optimizer._project_multipliers_wrt_euclidean_norm( + multipliers3, 1.0)) + + self.assertAllClose( + expected_projected_multipliers1, + projected_multipliers1, + rtol=0, + atol=1e-6) + self.assertAllClose( + expected_projected_multipliers2, + projected_multipliers2, + rtol=0, + atol=1e-6) + self.assertAllClose( + expected_projected_multipliers3, + projected_multipliers3, + rtol=0, + atol=1e-6) + + def test_additive_external_regret_optimizer(self): + """Tests that the Lagrange multipliers update as expected.""" + minimization_problem = test_util.ConstantMinimizationProblem( + np.array([0.6, -0.1, 0.4])) + optimizer = AdditiveExternalRegretOptimizerWrapper( + gradient_descent.GradientDescentOptimizer(1.0), + maximum_multiplier_radius=1.0) + train_op = optimizer.minimize_constrained(minimization_problem) + + expected_multipliers = [ + np.array([0.0, 0.0, 0.0]), + np.array([0.6, 0.0, 0.4]), + np.array([0.7, 0.0, 0.3]), + np.array([0.8, 0.0, 0.2]), + np.array([0.9, 0.0, 0.1]), + np.array([1.0, 0.0, 0.0]), + np.array([1.0, 0.0, 0.0]), + ] + + multipliers = [] + with self.test_session() as session: + session.run(standard_ops.global_variables_initializer()) + while len(multipliers) < len(expected_multipliers): + multipliers.append(session.run(optimizer.lagrange_multipliers)) + session.run(train_op) + + for expected, actual in zip(expected_multipliers, multipliers): + self.assertAllClose(expected, actual, rtol=0, atol=1e-6) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..04014ab4aebd6d9cd70653c53f9361320e803329 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py @@ -0,0 +1,595 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines `{Additive,Multiplicative}SwapRegretOptimizer`s. + +These optimizers minimize a `ConstrainedMinimizationProblem` by using a +swap-regret minimizing algorithm (either SGD or multiplicative weights) to learn +what weights should be associated with the objective function and constraints. +These algorithms do *not* use Lagrange multipliers, but the idea is similar. +The main differences between the formulation used here, and the standard +Lagrangian formulation, are that (i) the objective function is weighted, in +addition to the constraints, and (ii) we learn a matrix of weights, instead of a +vector. + +For the purposes of constrained optimization, at least in theory, +external-regret minimization suffices if the `ConstrainedMinimizationProblem` +we're optimizing doesn't have any `proxy_constraints`, while swap-regret +minimization should be used if `proxy_constraints` are present. + +For more specifics, please refer to: + +> Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex +> Constrained Optimization". +> [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + +The formulation used by both of the SwapRegretOptimizers can be found in +Definition 2, and is discussed in Section 4. The +`MultiplicativeSwapRegretOptimizer` is most similar to Algorithm 2 in Section 4, +with the difference being that it uses `tf.train.Optimizer`s, instead of SGD, +for the "inner" updates. The `AdditiveSwapRegretOptimizer` differs further in +that it performs additive (instead of multiplicative) updates of the stochastic +matrix. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import math + +import six + +from tensorflow.contrib.constrained_optimization.python import constrained_optimizer + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import optimizer as train_optimizer + + +def _maximal_eigenvector_power_method(matrix, + epsilon=1e-6, + maximum_iterations=100): + """Returns the maximal right-eigenvector of `matrix` using the power method. + + Args: + matrix: 2D Tensor, the matrix of which we will find the maximal + right-eigenvector. + epsilon: nonnegative float, if two iterations of the power method differ (in + L2 norm) by no more than epsilon, we will terminate. + maximum_iterations: nonnegative int, if we perform this many iterations, we + will terminate. + + Result: + The maximal right-eigenvector of `matrix`. + + Raises: + ValueError: If the epsilon or maximum_iterations parameters violate their + bounds. + """ + if epsilon <= 0.0: + raise ValueError("epsilon must be strictly positive") + if maximum_iterations <= 0: + raise ValueError("maximum_iterations must be strictly positive") + + def while_loop_condition(iteration, eigenvector, old_eigenvector): + """Returns false if the while loop should terminate.""" + not_done = (iteration < maximum_iterations) + not_converged = (standard_ops.norm(eigenvector - old_eigenvector) > epsilon) + return standard_ops.logical_and(not_done, not_converged) + + def while_loop_body(iteration, eigenvector, old_eigenvector): + """Performs one iteration of the power method.""" + del old_eigenvector # Needed by the condition, but not the body. + iteration += 1 + # We need to use tf.matmul() and tf.expand_dims(), instead of + # tf.tensordot(), since the former will infer the shape of the result, while + # the latter will not (tf.while_loop() needs the shapes). + new_eigenvector = standard_ops.matmul( + matrix, standard_ops.expand_dims(eigenvector, 1))[:, 0] + new_eigenvector /= standard_ops.norm(new_eigenvector) + return (iteration, new_eigenvector, eigenvector) + + iteration = standard_ops.constant(0) + eigenvector = standard_ops.ones_like(matrix[:, 0]) + eigenvector /= standard_ops.norm(eigenvector) + + # We actually want a do-while loop, so we explicitly call while_loop_body() + # once before tf.while_loop(). + iteration, eigenvector, old_eigenvector = while_loop_body( + iteration, eigenvector, eigenvector) + iteration, eigenvector, old_eigenvector = control_flow_ops.while_loop( + while_loop_condition, + while_loop_body, + loop_vars=(iteration, eigenvector, old_eigenvector), + name="power_method") + + return eigenvector + + +def _project_stochastic_matrix_wrt_euclidean_norm(matrix): + """Projects its argument onto the set of left-stochastic matrices. + + This algorithm is O(n^3) at worst, where `matrix` is n*n. It can be done in + O(n^2 * log(n)) time by sorting each column (and maybe better with a different + algorithm), but the algorithm implemented here is easier to implement in + TensorFlow. + + Args: + matrix: 2d square tensor, the matrix to project. + + Returns: + The 2d square tensor that results from projecting `matrix` onto the set of + left-stochastic matrices w.r.t. the Euclidean norm applied column-wise + (i.e. the Frobenius norm). + + Raises: + ValueError: if the `matrix` tensor does not have a fully-known shape, or is + not two-dimensional and square. + """ + matrix_shape = matrix.get_shape() + if matrix_shape is None: + raise ValueError("matrix must have known shape") + if matrix_shape.ndims != 2: + raise ValueError( + "matrix must be two dimensional (instead is %d-dimensional)" % + matrix_shape.ndims) + if matrix_shape[0] != matrix_shape[1]: + raise ValueError("matrix must be be square (instead has shape (%d,%d))" % + (matrix_shape[0], matrix_shape[1])) + dimension = matrix_shape[0].value + if dimension is None: + raise ValueError("matrix must have fully-known shape") + + def while_loop_condition(iteration, matrix, inactive, old_inactive): + """Returns false if the while loop should terminate.""" + del matrix # Needed by the body, but not the condition. + not_done = (iteration < dimension) + not_converged = standard_ops.reduce_any( + standard_ops.not_equal(inactive, old_inactive)) + return standard_ops.logical_and(not_done, not_converged) + + def while_loop_body(iteration, matrix, inactive, old_inactive): + """Performs one iteration of the projection.""" + del old_inactive # Needed by the condition, but not the body. + iteration += 1 + scale = (1.0 - standard_ops.reduce_sum( + matrix, axis=0, keep_dims=True)) / standard_ops.maximum( + 1.0, standard_ops.reduce_sum(inactive, axis=0, keep_dims=True)) + matrix += scale * inactive + new_inactive = standard_ops.to_float(matrix > 0) + matrix *= new_inactive + return (iteration, matrix, new_inactive, inactive) + + iteration = standard_ops.constant(0) + inactive = standard_ops.ones_like(matrix) + + # We actually want a do-while loop, so we explicitly call while_loop_body() + # once before tf.while_loop(). + iteration, matrix, inactive, old_inactive = while_loop_body( + iteration, matrix, inactive, inactive) + iteration, matrix, inactive, old_inactive = control_flow_ops.while_loop( + while_loop_condition, + while_loop_body, + loop_vars=(iteration, matrix, inactive, old_inactive), + name="euclidean_projection") + + return matrix + + +def _project_log_stochastic_matrix_wrt_kl_divergence(log_matrix): + """Projects its argument onto the set of log-left-stochastic matrices. + + Args: + log_matrix: 2d square tensor, the element-wise logarithm of the matrix to + project. + + Returns: + The 2d square tensor that results from projecting exp(`matrix`) onto the set + of left-stochastic matrices w.r.t. the KL-divergence applied column-wise. + """ + + # For numerical reasons, make sure that the largest matrix element is zero + # before exponentiating. + log_matrix -= standard_ops.reduce_max(log_matrix, axis=0, keep_dims=True) + log_matrix -= standard_ops.log( + standard_ops.reduce_sum( + standard_ops.exp(log_matrix), axis=0, keep_dims=True)) + return log_matrix + + +@six.add_metaclass(abc.ABCMeta) +class _SwapRegretOptimizer(constrained_optimizer.ConstrainedOptimizer): + """Base class representing a `_SwapRegretOptimizer`. + + This class contains most of the logic for performing constrained optimization, + minimizing external regret for the constraints player. What it *doesn't* do is + keep track of the internal state (the stochastic matrix). Instead, the state + is accessed via the _initial_state(), _stochastic_matrix(), + _constraint_grad_and_var() and _projection_op() methods. + + The reason for this is that we want to make it easy to implement different + representations of the internal state. For example, for additive updates, it's + most natural to store the stochastic matrix directly, whereas for + multiplicative updates, it's most natural to store its element-wise logarithm. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by `_SwapRegretOptimizer`s can be found in Definition 2, + and is discussed in Section 4. Such optimizers are most similar to Algorithm + 2 in Section 4. Most notably, the internal state is a left-stochastic matrix + of shape (m+1,m+1), where m is the number of constraints. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Constructs a new `_SwapRegretOptimizer`. + + The difference between `optimizer` and `constraint_optimizer` (if the latter + is provided) is that the former is used for learning the model parameters, + while the latter us used for the update to the constraint/objective weight + matrix (the analogue of Lagrange multipliers). If no `constraint_optimizer` + is provided, then `optimizer` is used for both. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multiplier analogues. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multiplier analogues. + + Returns: + A new `_SwapRegretOptimizer`. + """ + super(_SwapRegretOptimizer, self).__init__(optimizer=optimizer) + self._constraint_optimizer = constraint_optimizer + + @property + def constraint_optimizer(self): + """Returns the `tf.train.Optimizer` used for the matrix.""" + return self._constraint_optimizer + + @abc.abstractmethod + def _initial_state(self, num_constraints): + pass + + @abc.abstractmethod + def _stochastic_matrix(self, state): + pass + + def _distribution(self, state): + distribution = _maximal_eigenvector_power_method( + self._stochastic_matrix(state)) + distribution = standard_ops.abs(distribution) + distribution /= standard_ops.reduce_sum(distribution) + return distribution + + @abc.abstractmethod + def _constraint_grad_and_var(self, state, gradient): + pass + + @abc.abstractmethod + def _projection_op(self, state, name=None): + pass + + def minimize_constrained(self, + minimization_problem, + global_step=None, + var_list=None, + gate_gradients=train_optimizer.Optimizer.GATE_OP, + aggregation_method=None, + colocate_gradients_with_ops=False, + name=None, + grad_loss=None): + """Returns an `Op` for minimizing the constrained problem. + + The `optimizer` constructor parameter will be used to update the model + parameters, while the constraint/objective weight matrix (the analogue of + Lagrange multipliers) will be updated using `constrained_optimizer` (if + provided) or `optimizer` (if not). Whether the matrix updates are additive + or multiplicative depends on the derived class. + + Args: + minimization_problem: ConstrainedMinimizationProblem, the problem to + optimize. + global_step: as in `tf.train.Optimizer`'s `minimize` method. + var_list: as in `tf.train.Optimizer`'s `minimize` method. + gate_gradients: as in `tf.train.Optimizer`'s `minimize` method. + aggregation_method: as in `tf.train.Optimizer`'s `minimize` method. + colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize` + method. + name: as in `tf.train.Optimizer`'s `minimize` method. + grad_loss: as in `tf.train.Optimizer`'s `minimize` method. + + Returns: + TensorFlow Op. + """ + objective = minimization_problem.objective + + constraints = minimization_problem.constraints + proxy_constraints = minimization_problem.proxy_constraints + if proxy_constraints is None: + proxy_constraints = constraints + # Flatten both constraints tensors to 1d. + num_constraints = minimization_problem.num_constraints + constraints = standard_ops.reshape(constraints, shape=(num_constraints,)) + proxy_constraints = standard_ops.reshape( + proxy_constraints, shape=(num_constraints,)) + + # We use a lambda to initialize the state so that, if this function call is + # inside the scope of a tf.control_dependencies() block, the dependencies + # will not be applied to the initializer. + state = standard_ops.Variable( + lambda: self._initial_state(num_constraints), + trainable=False, + name="swap_regret_optimizer_state") + + zero_and_constraints = standard_ops.concat( + (standard_ops.zeros((1,)), constraints), axis=0) + objective_and_proxy_constraints = standard_ops.concat( + (standard_ops.expand_dims(objective, 0), proxy_constraints), axis=0) + + distribution = self._distribution(state) + loss = standard_ops.tensordot(distribution, objective_and_proxy_constraints, + 1) + matrix_gradient = standard_ops.matmul( + standard_ops.expand_dims(zero_and_constraints, 1), + standard_ops.expand_dims(distribution, 0)) + + update_ops = [] + if self.constraint_optimizer is None: + # If we don't have a separate constraint_optimizer, then we use + # self._optimizer for both the update of the model parameters, and that of + # the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + grads_and_vars.append( + self._constraint_grad_and_var(state, matrix_gradient)) + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + else: + # If we have a separate constraint_optimizer, then we use self._optimizer + # for the update of the model parameters, and self._constraint_optimizer + # for that of the internal state. + grads_and_vars = self.optimizer.compute_gradients( + loss, + var_list=var_list, + gate_gradients=gate_gradients, + aggregation_method=aggregation_method, + colocate_gradients_with_ops=colocate_gradients_with_ops, + grad_loss=grad_loss) + matrix_grads_and_vars = [ + self._constraint_grad_and_var(state, matrix_gradient) + ] + + gradients = [ + gradient for gradient, _ in grads_and_vars + matrix_grads_and_vars + if gradient is not None + ] + with ops.control_dependencies(gradients): + update_ops.append( + self.optimizer.apply_gradients(grads_and_vars, name="update")) + update_ops.append( + self.constraint_optimizer.apply_gradients( + matrix_grads_and_vars, name="optimizer_state_update")) + + with ops.control_dependencies(update_ops): + if global_step is None: + # If we don't have a global step, just project, and we're done. + return self._projection_op(state, name=name) + else: + # If we have a global step, then we need to increment it in addition to + # projecting. + projection_op = self._projection_op(state, name="project") + with ops.colocate_with(global_step): + global_step_op = state_ops.assign_add( + global_step, 1, name="global_step_increment") + return control_flow_ops.group(projection_op, global_step_op, name=name) + + +class AdditiveSwapRegretOptimizer(_SwapRegretOptimizer): + """A `ConstrainedOptimizer` based on swap-regret minimization. + + This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly + minimize over the model parameters, and maximize over constraint/objective + weight matrix (the analogue of Lagrange multipliers), with the latter + maximization using additive updates and an algorithm that minimizes swap + regret. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by this optimizer can be found in Definition 2, and is + discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with + the differences being that it uses `tf.train.Optimizer`s, instead of SGD, for + the "inner" updates, and performs additive (instead of multiplicative) updates + of the stochastic matrix. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Constructs a new `AdditiveSwapRegretOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multiplier analogues. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multiplier analogues. + + Returns: + A new `AdditiveSwapRegretOptimizer`. + """ + # TODO(acotter): add a parameter determining the initial values of the + # matrix elements (like initial_multiplier_radius in + # MultiplicativeSwapRegretOptimizer). + super(AdditiveSwapRegretOptimizer, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + + def _initial_state(self, num_constraints): + # For an AdditiveSwapRegretOptimizer, the internal state is a tensor of + # shape (m+1,m+1), where m is the number of constraints, representing a + # left-stochastic matrix. + dimension = num_constraints + 1 + # Initialize by putting all weight on the objective, and none on the + # constraints. + return standard_ops.concat( + (standard_ops.ones( + (1, dimension)), standard_ops.zeros((dimension - 1, dimension))), + axis=0) + + def _stochastic_matrix(self, state): + return state + + def _constraint_grad_and_var(self, state, gradient): + # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True? + return (-gradient, state) + + def _projection_op(self, state, name=None): + with ops.colocate_with(state): + return state_ops.assign( + state, + _project_stochastic_matrix_wrt_euclidean_norm(state), + name=name) + + +class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer): + """A `ConstrainedOptimizer` based on swap-regret minimization. + + This `ConstrainedOptimizer` uses the given `tf.train.Optimizer`s to jointly + minimize over the model parameters, and maximize over constraint/objective + weight matrix (the analogue of Lagrange multipliers), with the latter + maximization using multiplicative updates and an algorithm that minimizes swap + regret. + + For more specifics, please refer to: + + > Cotter, Jiang and Sridharan. "Two-Player Games for Efficient Non-Convex + > Constrained Optimization". + > [https://arxiv.org/abs/1804.06500](https://arxiv.org/abs/1804.06500) + + The formulation used by this optimizer can be found in Definition 2, and is + discussed in Section 4. It is most similar to Algorithm 2 in Section 4, with + the difference being that it uses `tf.train.Optimizer`s, instead of SGD, for + the "inner" updates. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + minimum_multiplier_radius=1e-3, + initial_multiplier_radius=None): + """Constructs a new `MultiplicativeSwapRegretOptimizer`. + + Args: + optimizer: tf.train.Optimizer, used to optimize the objective and + proxy_constraints portion of ConstrainedMinimizationProblem. If + constraint_optimizer is not provided, this will also be used to optimize + the Lagrange multiplier analogues. + constraint_optimizer: optional tf.train.Optimizer, used to optimize the + Lagrange multiplier analogues. + minimum_multiplier_radius: float, each element of the matrix will be lower + bounded by `minimum_multiplier_radius` divided by one plus the number of + constraints. + initial_multiplier_radius: float, the initial value of each element of the + matrix associated with a constraint (i.e. excluding those elements + associated with the objective) will be `initial_multiplier_radius` + divided by one plus the number of constraints. Defaults to the value of + `minimum_multiplier_radius`. + + Returns: + A new `MultiplicativeSwapRegretOptimizer`. + + Raises: + ValueError: If the two radius parameters are inconsistent. + """ + super(MultiplicativeSwapRegretOptimizer, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + + if (minimum_multiplier_radius <= 0.0) or (minimum_multiplier_radius >= 1.0): + raise ValueError("minimum_multiplier_radius must be in the range (0,1)") + if initial_multiplier_radius is None: + initial_multiplier_radius = minimum_multiplier_radius + elif (initial_multiplier_radius < + minimum_multiplier_radius) or (minimum_multiplier_radius > 1.0): + raise ValueError("initial_multiplier_radius must be in the range " + "[minimum_multiplier_radius,1]") + + self._minimum_multiplier_radius = minimum_multiplier_radius + self._initial_multiplier_radius = initial_multiplier_radius + + def _initial_state(self, num_constraints): + # For a MultiplicativeSwapRegretOptimizer, the internal state is a tensor of + # shape (m+1,m+1), where m is the number of constraints, representing the + # element-wise logarithm of a left-stochastic matrix. + dimension = num_constraints + 1 + # Initialize by putting as much weight as possible on the objective, and as + # little as possible on the constraints. + log_initial_one = math.log(1.0 - (self._initial_multiplier_radius * + (dimension - 1) / (dimension))) + log_initial_zero = math.log(self._initial_multiplier_radius / dimension) + return standard_ops.concat( + (standard_ops.constant( + log_initial_one, dtype=dtypes.float32, shape=(1, dimension)), + standard_ops.constant( + log_initial_zero, + dtype=dtypes.float32, + shape=(dimension - 1, dimension))), + axis=0) + + def _stochastic_matrix(self, state): + return standard_ops.exp(state) + + def _constraint_grad_and_var(self, state, gradient): + # TODO(acotter): tf.colocate_with(), if colocate_gradients_with_ops is True? + return (-gradient, state) + + def _projection_op(self, state, name=None): + with ops.colocate_with(state): + # Gets the dimension of the state (num_constraints + 1)--all of these + # assertions are of things that should be impossible, since the state + # passed into this method will have the same shape as that returned by + # _initial_state(). + state_shape = state.get_shape() + assert state_shape is not None + assert state_shape.ndims == 2 + assert state_shape[0] == state_shape[1] + dimension = state_shape[0].value + assert dimension is not None + + minimum_log_multiplier = standard_ops.log( + self._minimum_multiplier_radius / standard_ops.to_float(dimension)) + + return state_ops.assign( + state, + standard_ops.maximum( + _project_log_stochastic_matrix_wrt_kl_divergence(state), + minimum_log_multiplier), + name=name) diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..34c4543dca97e12c8335e4c90b849820edaefa81 --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer_test.py @@ -0,0 +1,212 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for constrained_optimization.python.swap_regret_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.constrained_optimization.python import swap_regret_optimizer +from tensorflow.contrib.constrained_optimization.python import test_util + +from tensorflow.python.ops import standard_ops +from tensorflow.python.platform import test +from tensorflow.python.training import gradient_descent + + +class AdditiveSwapRegretOptimizerWrapper( + swap_regret_optimizer.AdditiveSwapRegretOptimizer): + """Testing wrapper class around AdditiveSwapRegretOptimizer. + + This class is identical to AdditiveSwapRegretOptimizer, except that it caches + the internal optimization state when _stochastic_matrix() is called, so that + we can test that the stochastic matrices take on their expected values. + """ + + def __init__(self, optimizer, constraint_optimizer=None): + """Same as AdditiveSwapRegretOptimizer.__init__().""" + super(AdditiveSwapRegretOptimizerWrapper, self).__init__( + optimizer=optimizer, constraint_optimizer=constraint_optimizer) + self._cached_stochastic_matrix = None + + @property + def stochastic_matrix(self): + """Returns the cached stochastic matrix.""" + return self._cached_stochastic_matrix + + def _stochastic_matrix(self, state): + """Caches the internal state for testing.""" + self._cached_stochastic_matrix = super(AdditiveSwapRegretOptimizerWrapper, + self)._stochastic_matrix(state) + return self._cached_stochastic_matrix + + +class MultiplicativeSwapRegretOptimizerWrapper( + swap_regret_optimizer.MultiplicativeSwapRegretOptimizer): + """Testing wrapper class around MultiplicativeSwapRegretOptimizer. + + This class is identical to MultiplicativeSwapRegretOptimizer, except that it + caches the internal optimization state when _stochastic_matrix() is called, so + that we can test that the stochastic matrices take on their expected values. + """ + + def __init__(self, + optimizer, + constraint_optimizer=None, + minimum_multiplier_radius=None, + initial_multiplier_radius=None): + """Same as MultiplicativeSwapRegretOptimizer.__init__().""" + super(MultiplicativeSwapRegretOptimizerWrapper, self).__init__( + optimizer=optimizer, + constraint_optimizer=constraint_optimizer, + minimum_multiplier_radius=1e-3, + initial_multiplier_radius=initial_multiplier_radius) + self._cached_stochastic_matrix = None + + @property + def stochastic_matrix(self): + """Returns the cached stochastic matrix.""" + return self._cached_stochastic_matrix + + def _stochastic_matrix(self, state): + """Caches the internal state for testing.""" + self._cached_stochastic_matrix = super( + MultiplicativeSwapRegretOptimizerWrapper, + self)._stochastic_matrix(state) + return self._cached_stochastic_matrix + + +class SwapRegretOptimizerTest(test.TestCase): + + def test_maximum_eigenvector_power_method(self): + """Tests power method routine on some known left-stochastic matrices.""" + matrix1 = np.matrix([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], [0.4, 0.3, 0.0]]) + matrix2 = np.matrix([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], [0.4, 0.5, 0.3]]) + + with self.test_session() as session: + eigenvector1 = session.run( + swap_regret_optimizer._maximal_eigenvector_power_method( + standard_ops.constant(matrix1))) + eigenvector2 = session.run( + swap_regret_optimizer._maximal_eigenvector_power_method( + standard_ops.constant(matrix2))) + + # Check that eigenvector1 and eigenvector2 are eigenvectors of matrix1 and + # matrix2 (respectively) with associated eigenvalue 1. + matrix_eigenvector1 = np.tensordot(matrix1, eigenvector1, axes=1) + matrix_eigenvector2 = np.tensordot(matrix2, eigenvector2, axes=1) + self.assertAllClose(eigenvector1, matrix_eigenvector1, rtol=0, atol=1e-6) + self.assertAllClose(eigenvector2, matrix_eigenvector2, rtol=0, atol=1e-6) + + def test_project_stochastic_matrix_wrt_euclidean_norm(self): + """Tests Euclidean projection routine on some known values.""" + matrix = standard_ops.constant([[-0.1, -0.1, 0.4], [-0.8, 0.4, 1.2], + [-0.3, 0.1, 0.2]]) + expected_projected_matrix = np.array([[0.6, 0.1, 0.1], [0.0, 0.6, 0.9], + [0.4, 0.3, 0.0]]) + + with self.test_session() as session: + projected_matrix = session.run( + swap_regret_optimizer._project_stochastic_matrix_wrt_euclidean_norm( + matrix)) + + self.assertAllClose( + expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6) + + def test_project_log_stochastic_matrix_wrt_kl_divergence(self): + """Tests KL-divergence projection routine on some known values.""" + matrix = standard_ops.constant([[0.2, 0.8, 0.6], [0.1, 0.2, 1.5], + [0.2, 1.0, 0.9]]) + expected_projected_matrix = np.array([[0.4, 0.4, 0.2], [0.2, 0.1, 0.5], + [0.4, 0.5, 0.3]]) + + with self.test_session() as session: + projected_matrix = session.run( + standard_ops.exp( + swap_regret_optimizer. + _project_log_stochastic_matrix_wrt_kl_divergence( + standard_ops.log(matrix)))) + + self.assertAllClose( + expected_projected_matrix, projected_matrix, rtol=0, atol=1e-6) + + def test_additive_swap_regret_optimizer(self): + """Tests that the stochastic matrices update as expected.""" + minimization_problem = test_util.ConstantMinimizationProblem( + np.array([0.6, -0.1, 0.4])) + optimizer = AdditiveSwapRegretOptimizerWrapper( + gradient_descent.GradientDescentOptimizer(1.0)) + train_op = optimizer.minimize_constrained(minimization_problem) + + # Calculated using a numpy+python implementation of the algorithm. + expected_matrices = [ + np.array([[1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]]), + np.array([[0.66666667, 1.0, 1.0, 1.0], [0.26666667, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0], [0.06666667, 0.0, 0.0, 0.0]]), + np.array([[0.41666667, 0.93333333, 1.0, + 0.98333333], [0.46666667, 0.05333333, 0.0, + 0.01333333], [0.0, 0.0, 0.0, 0.0], + [0.11666667, 0.01333333, 0.0, 0.00333333]]), + ] + + matrices = [] + with self.test_session() as session: + session.run(standard_ops.global_variables_initializer()) + while len(matrices) < len(expected_matrices): + matrices.append(session.run(optimizer.stochastic_matrix)) + session.run(train_op) + + for expected, actual in zip(expected_matrices, matrices): + self.assertAllClose(expected, actual, rtol=0, atol=1e-6) + + def test_multiplicative_swap_regret_optimizer(self): + """Tests that the stochastic matrices update as expected.""" + minimization_problem = test_util.ConstantMinimizationProblem( + np.array([0.6, -0.1, 0.4])) + optimizer = MultiplicativeSwapRegretOptimizerWrapper( + gradient_descent.GradientDescentOptimizer(1.0), + initial_multiplier_radius=0.8) + train_op = optimizer.minimize_constrained(minimization_problem) + + # Calculated using a numpy+python implementation of the algorithm. + expected_matrices = [ + np.array([[0.4, 0.4, 0.4, 0.4], [0.2, 0.2, 0.2, 0.2], + [0.2, 0.2, 0.2, 0.2], [0.2, 0.2, 0.2, 0.2]]), + np.array([[0.36999014, 0.38528351, 0.38528351, 0.38528351], [ + 0.23517483, 0.21720297, 0.21720297, 0.21720297 + ], [0.17774131, 0.18882719, 0.18882719, 0.18882719], + [0.21709373, 0.20868632, 0.20868632, 0.20868632]]), + np.array([[0.33972109, 0.36811863, 0.37118462, 0.36906575], [ + 0.27114826, 0.23738228, 0.23376693, 0.23626491 + ], [0.15712313, 0.17641793, 0.17858959, 0.17708679], + [0.23200752, 0.21808115, 0.21645886, 0.21758255]]), + ] + + matrices = [] + with self.test_session() as session: + session.run(standard_ops.global_variables_initializer()) + while len(matrices) < len(expected_matrices): + matrices.append(session.run(optimizer.stochastic_matrix)) + session.run(train_op) + + for expected, actual in zip(expected_matrices, matrices): + self.assertAllClose(expected, actual, rtol=0, atol=1e-6) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/constrained_optimization/python/test_util.py b/tensorflow/contrib/constrained_optimization/python/test_util.py new file mode 100644 index 0000000000000000000000000000000000000000..704b36ca4c9cf94e7c304f9bed4f6ac7ca275deb --- /dev/null +++ b/tensorflow/contrib/constrained_optimization/python/test_util.py @@ -0,0 +1,58 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains helpers used by tests.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.constrained_optimization.python import constrained_minimization_problem + +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import standard_ops + + +class ConstantMinimizationProblem( + constrained_minimization_problem.ConstrainedMinimizationProblem): + """A `ConstrainedMinimizationProblem` with constant constraint violations. + + This minimization problem is intended for use in performing simple tests of + the Lagrange multiplier (or equivalent) update in the optimizers. There is a + one-element "dummy" model parameter, but it should be ignored. + """ + + def __init__(self, constraints): + """Constructs a new `ConstantMinimizationProblem'. + + Args: + constraints: 1d numpy array, the constant constraint violations. + + Returns: + A new `ConstantMinimizationProblem'. + """ + # We make an fake 1-parameter linear objective so that we don't get a "no + # variables to optimize" error. + self._objective = standard_ops.Variable(0.0, dtype=dtypes.float32) + self._constraints = standard_ops.constant(constraints, dtype=dtypes.float32) + + @property + def objective(self): + """Returns the objective function.""" + return self._objective + + @property + def constraints(self): + """Returns the constant constraint violations.""" + return self._constraints diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py index 6fb56b0858786662546ecab425b1a2564fbd9a64..012b17cee88aecb44c45dd44ea72fd43b5494085 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py @@ -1072,6 +1072,17 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase): class CudnnRNNTestTraining(test_util.TensorFlowTestCase): + def setUp(self): + super(CudnnRNNTestTraining, self).setUp() + self._reset_rnd_gen_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", + str(False)) + self._rnn_use_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") + + def tearDown(self): + super(CudnnRNNTestTraining, self).tearDown() + os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = self._reset_rnd_gen_state + os.environ["TF_CUDNN_RNN_USE_V2"] = self._rnn_use_v2 + def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1): """Compute the numeric gradient of y wrt to x. @@ -1184,11 +1195,10 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, dropout, dtype, - delta, tolerance): + use_v2, delta, tolerance): # Gradient checking runs two forward ops with almost the same input. Need to # make sure the drop patterns across the two runs are the same. logging.info("Training test with config: %s", locals()) - old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False)) os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True) np.random.seed(1234) @@ -1196,6 +1206,10 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): has_input_c = (rnn_mode == CUDNN_LSTM) direction = (CUDNN_RNN_UNIDIRECTION if dir_count == 1 else CUDNN_RNN_BIDIRECTION) + if use_v2: + os.environ["TF_CUDNN_RNN_USE_V2"] = "1" + else: + os.environ["TF_CUDNN_RNN_USE_V2"] = "0" model = CudnnTestModel( rnn_mode, num_layers, @@ -1245,22 +1259,22 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase): self._GradientCheck( sess, total_sum, all_inputs, tolerance=tolerance, delta=delta) - os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state def _TestSimpleTrainingHelper(self, rnn_mode, test_configs): dropouts = [0, 0.5, 1.] - for config, dropout in itertools.product(test_configs, dropouts): + v2_options = [str(False), str(True)] + for config, dropout, use_v2 in itertools.product(test_configs, dropouts, + v2_options): dtype = config.get("dtype", dtypes.float32) delta = config.get("delta", 1e-4) tolerance = config.get("tolerance", 1e-6) dir_count = config.get("dir_count", 1) shape = config["shape"] with ops.Graph().as_default(): - self._TestOneSimpleTraining(rnn_mode, shape["num_layers"], - shape["num_units"], shape["input_size"], - shape["batch_size"], shape["seq_length"], - dir_count, dropout, dtype, delta, - tolerance) + self._TestOneSimpleTraining( + rnn_mode, shape["num_layers"], shape["num_units"], + shape["input_size"], shape["batch_size"], shape["seq_length"], + dir_count, dropout, dtype, use_v2, delta, tolerance) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index a1ede4471ef24adbfb7db32ee7ea276f33ed20a9..73a961992e19fabec5d0f75be1b52dbba20eb7af 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os from tensorflow.contrib.checkpoint.python import split_dependency from tensorflow.contrib.rnn.python.ops import lstm_ops from tensorflow.python.framework import common_shapes @@ -901,19 +902,27 @@ def _cudnn_rnn(inputs, check_direction(direction) check_input_mode(input_mode) seed, seed2 = random_seed.get_seed(seed) - outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn( - input=inputs, - input_h=input_h, - input_c=input_c, - params=params, - is_training=is_training, - rnn_mode=rnn_mode, - input_mode=input_mode, - direction=direction, - dropout=dropout, - seed=seed, - seed2=seed2, - name=name) + # TODO(jamesqin): switch default value to "1" on May 25th 2018, and get rid + # of V1 ops. + use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") + args = { + "input": inputs, + "input_h": input_h, + "input_c": input_c, + "params": params, + "is_training": is_training, + "rnn_mode": rnn_mode, + "input_mode": input_mode, + "direction": direction, + "dropout": dropout, + "seed": seed, + "seed2": seed2, + "name": name + } + if use_cudnn_v2 is not "1": + outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) + else: + outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) return (outputs, output_h, output_c) diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index f5641e89fb8eb26140b972e69725bdf83f5b3ae8..d59dd17aea42618075e69516bcfa4ee2b9eafc81 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -343,6 +343,7 @@ py_test( "//tensorflow/python:dtypes", "//tensorflow/python:errors", "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/eager:context", "//third_party/py/numpy", ], ) diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py index e0494736b72ae52f586cb80d42a5c1e50ac17a61..1a97a84b2cba13e82c8af9c4c8ee413ee8264a5e 100644 --- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py @@ -24,9 +24,11 @@ import numpy as np from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base from tensorflow.contrib.data.python.ops import scan_ops from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -57,19 +59,24 @@ class ScanDatasetTest(test.TestCase): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + @test_util.run_in_graph_and_eager_modes() def testFibonacci(self): iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply( scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1])) ).make_one_shot_iterator() - next_element = iterator.get_next() - with self.test_session() as sess: - self.assertEqual(1, sess.run(next_element)) - self.assertEqual(1, sess.run(next_element)) - self.assertEqual(2, sess.run(next_element)) - self.assertEqual(3, sess.run(next_element)) - self.assertEqual(5, sess.run(next_element)) - self.assertEqual(8, sess.run(next_element)) + if context.executing_eagerly(): + next_element = iterator.get_next + else: + get_next = iterator.get_next() + next_element = lambda: get_next + + self.assertEqual(1, self.evaluate(next_element())) + self.assertEqual(1, self.evaluate(next_element())) + self.assertEqual(2, self.evaluate(next_element())) + self.assertEqual(3, self.evaluate(next_element())) + self.assertEqual(5, self.evaluate(next_element())) + self.assertEqual(8, self.evaluate(next_element())) def testChangingStateShape(self): # Test the fixed-point shape invariant calculations: start with diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py index fe49ee8b1946c03dca37c2bc17dde71646b85bbd..60ef7efba4bb2bc281bc624ec3f58117ffa9a824 100644 --- a/tensorflow/contrib/data/python/ops/scan_ops.py +++ b/tensorflow/contrib/data/python/ops/scan_ops.py @@ -144,6 +144,7 @@ class _ScanDataset(dataset_ops.Dataset): weakened_state_shapes) self._scan_func = tf_scan_func + self._scan_func.add_to_graph(ops.get_default_graph()) def _as_variant_tensor(self): input_t = self._input_dataset._as_variant_tensor() # pylint: disable=protected-access diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py index c5a520ab5aeafb932092ebbbaaf07480cf40403b..34410a6470185ac2821bc6a59de9230ff478aeb6 100644 --- a/tensorflow/contrib/distribute/python/estimator_integration_test.py +++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py @@ -61,7 +61,8 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase, mode=['graph'], distribution=[ combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus ])) def test_complete_flow_with_mode(self, distribution): label_dimension = 2 diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py new file mode 100644 index 0000000000000000000000000000000000000000..a5f5219588fb3be67beb797ba68ed8148e9e9fd2 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/ordered_test.py @@ -0,0 +1,109 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.distributions.python.ops.bijectors.ordered import Ordered +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite +from tensorflow.python.platform import test + + + +class OrderedBijectorTest(test.TestCase): + """Tests correctness of the ordered transformation.""" + + def setUp(self): + self._rng = np.random.RandomState(42) + + @test_util.run_in_graph_and_eager_modes() + def testBijectorVector(self): + with self.test_session(): + ordered = Ordered() + self.assertEqual("ordered", ordered.name) + x = np.asarray([[2., 3, 4], [4., 8, 13]]) + y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]] + self.assertAllClose(y, self.evaluate(ordered.forward(x))) + self.assertAllClose(x, self.evaluate(ordered.inverse(y))) + self.assertAllClose( + np.sum(np.asarray(y)[..., 1:], axis=-1), + self.evaluate(ordered.inverse_log_det_jacobian(y, event_ndims=1)), + atol=0., + rtol=1e-7) + self.assertAllClose( + self.evaluate(-ordered.inverse_log_det_jacobian(y, event_ndims=1)), + self.evaluate(ordered.forward_log_det_jacobian(x, event_ndims=1)), + atol=0., + rtol=1e-7) + + def testBijectorUnknownShape(self): + with self.test_session(): + ordered = Ordered() + self.assertEqual("ordered", ordered.name) + x = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32) + real_x = np.asarray([[2., 3, 4], [4., 8, 13]]) + y = array_ops.placeholder(shape=[2, None], dtype=dtypes.float32) + real_y = [[2., 0, 0], [4., np.log(4.), np.log(5.)]] + self.assertAllClose(real_y, ordered.forward(x).eval( + feed_dict={x: real_x})) + self.assertAllClose(real_x, ordered.inverse(y).eval( + feed_dict={y: real_y})) + self.assertAllClose( + np.sum(np.asarray(real_y)[..., 1:], axis=-1), + ordered.inverse_log_det_jacobian(y, event_ndims=1).eval( + feed_dict={y: real_y}), + atol=0., + rtol=1e-7) + self.assertAllClose( + -ordered.inverse_log_det_jacobian(y, event_ndims=1).eval( + feed_dict={y: real_y}), + ordered.forward_log_det_jacobian(x, event_ndims=1).eval( + feed_dict={x: real_x}), + atol=0., + rtol=1e-7) + + @test_util.run_in_graph_and_eager_modes() + def testShapeGetters(self): + with self.test_session(): + x = tensor_shape.TensorShape([4]) + y = tensor_shape.TensorShape([4]) + bijector = Ordered(validate_args=True) + self.assertAllEqual(y, bijector.forward_event_shape(x)) + self.assertAllEqual(y.as_list(), + self.evaluate(bijector.forward_event_shape_tensor( + x.as_list()))) + self.assertAllEqual(x, bijector.inverse_event_shape(y)) + self.assertAllEqual(x.as_list(), + self.evaluate(bijector.inverse_event_shape_tensor( + y.as_list()))) + + def testBijectiveAndFinite(self): + with self.test_session(): + ordered = Ordered() + x = np.sort(self._rng.randn(3, 10), axis=-1).astype(np.float32) + y = (self._rng.randn(3, 10)).astype(np.float32) + assert_bijective_and_finite(ordered, x, y, event_ndims=1) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py index babce80396cfc41b53e99f91038d4f077c7efe82..51478dbeffaabc58ce3662f25f06bc579e8a407e 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py @@ -30,6 +30,7 @@ @@Invert @@Kumaraswamy @@MaskedAutoregressiveFlow +@@Ordered @@Permute @@PowerTransform @@RealNVP @@ -67,6 +68,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.inline import * from tensorflow.contrib.distributions.python.ops.bijectors.invert import * from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import * from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import * +from tensorflow.contrib.distributions.python.ops.bijectors.ordered import * from tensorflow.contrib.distributions.python.ops.bijectors.permute import * from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import * from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import * diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py index caae2adcfac7643cdc8f76dd1cccddd516105410..ecdb8967f43e5960b2285de05125d0c3dbafe63c 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py @@ -170,7 +170,7 @@ class CholeskyOuterProduct(bijector.Bijector): sum_weighted_log_diag = array_ops.squeeze( math_ops.matmul(math_ops.log(diag), exponents[..., array_ops.newaxis]), - squeeze_dims=-1) + axis=-1) fldj = p_float * np.log(2.) + sum_weighted_log_diag return fldj diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py index 1904239a0e7009c35cc4f3c8876fd749463a2b83..84a3289ba2160ed22a2bc7030dd612ba9ca6f6df 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py @@ -18,14 +18,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.ops.distributions import bijector as bijector_lib +from tensorflow.python.ops.distributions import bijector __all__ = [ "Invert", ] -class Invert(bijector_lib.Bijector): +class Invert(bijector.Bijector): """Bijector which inverts another Bijector. Example Use: [ExpGammaDistribution (see Background & Context)]( diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py index ef56cf6ddda4dca2b1575e844b2584689e531b81..83667b0e80cfcc1c4f0617cdc739221f24439665 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import template as template_ops from tensorflow.python.ops import variable_scope as variable_scope_lib -from tensorflow.python.ops.distributions import bijector as bijector_lib +from tensorflow.python.ops.distributions import bijector __all__ = [ @@ -42,7 +42,7 @@ __all__ = [ ] -class MaskedAutoregressiveFlow(bijector_lib.Bijector): +class MaskedAutoregressiveFlow(bijector.Bijector): """Affine MaskedAutoregressiveFlow bijector for vector-valued events. The affine autoregressive flow [(Papamakarios et al., 2016)][3] provides a diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py new file mode 100644 index 0000000000000000000000000000000000000000..3f03592f314cc13e8a9ea7e2ae18c5bb1f14e74f --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/bijectors/ordered.py @@ -0,0 +1,125 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Ordered bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import bijector + + +__all__ = [ + "Ordered", +] + + +class Ordered(bijector.Bijector): + """Bijector which maps a tensor x_k that has increasing elements in the last + dimension to an unconstrained tensor y_k. + + Both the domain and the codomain of the mapping is [-inf, inf], however, + the input of the forward mapping must be strictly increasing. + The inverse of the bijector applied to a normal random vector `y ~ N(0, 1)` + gives back a sorted random vector with the same distribution `x ~ N(0, 1)` + where `x = sort(y)` + + On the last dimension of the tensor, Ordered bijector performs: + `y[0] = x[0]` + `y[1:] = math_ops.log(x[1:] - x[:-1])` + + #### Example Use: + + ```python + bijector.Ordered().forward([2, 3, 4]) + # Result: [2., 0., 0.] + + bijector.Ordered().inverse([0.06428002, -1.07774478, -0.71530371]) + # Result: [0.06428002, 0.40464228, 0.8936858] + ``` + """ + + def __init__(self, validate_args=False, name="ordered"): + super(Ordered, self).__init__( + forward_min_event_ndims=1, + validate_args=validate_args, + name=name) + + def _forward_event_shape(self, input_shape): + if input_shape.ndims is None or input_shape[-1] is None: + return input_shape + return tensor_shape.TensorShape([input_shape[-1]]) + + def _forward_event_shape_tensor(self, input_shape): + return (input_shape[-1])[..., array_ops.newaxis] + + def _inverse_event_shape(self, output_shape): + if output_shape.ndims is None or output_shape[-1] is None: + return output_shape + if output_shape[-1] <= 1: + raise ValueError("output_shape[-1] = %d <= 1" % output_shape[-1]) + return tensor_shape.TensorShape([output_shape[-1]]) + + def _inverse_event_shape_tensor(self, output_shape): + if self.validate_args: + is_greater_one = check_ops.assert_greater( + output_shape[-1], 1, message="Need last dimension greater than 1.") + output_shape = control_flow_ops.with_dependencies( + [is_greater_one], output_shape) + return (output_shape[-1])[..., array_ops.newaxis] + + def _forward(self, x): + x = self._maybe_assert_valid_x(x) + y0 = x[..., 0, array_ops.newaxis] + yk = math_ops.log(x[..., 1:] - x[..., :-1]) + y = array_ops.concat([y0, yk], axis=-1) + return y + + def _inverse(self, y): + x0 = y[..., 0, array_ops.newaxis] + xk = math_ops.exp(y[..., 1:]) + x = array_ops.concat([x0, xk], axis=-1) + return math_ops.cumsum(x, axis=-1) + + def _inverse_log_det_jacobian(self, y): + # The Jacobian of the inverse mapping is lower + # triangular, with the diagonal elements being: + # J[i,i] = 1 if i=1, and + # exp(y_i) if 1GPU->CPU, which forces - # a sync. This is a roundabout way, yes. + # then this will force a copy from CPU->NON_CPU_DEVICE->CPU, + # which forces a sync. This is a roundabout way, yes. tf.constant(1.).cpu() - def _benchmark_eager_apply(self, label, defun=False, execution_mode=None): + def _benchmark_eager_apply(self, label, defun=False, execution_mode=None, + device_and_format=None): with tfe.execution_mode(execution_mode): - device, data_format = device_and_data_format() + device, data_format = device_and_format or device_and_data_format() model = resnet50.ResNet50(data_format) if defun: model.call = tfe.defun(model.call) @@ -207,7 +213,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): num_burn = 5 num_iters = 30 with tf.device(device): - images, _ = random_batch(batch_size) + images, _ = random_batch(batch_size, device_and_format) for _ in xrange(num_burn): model(images, training=False).cpu() if execution_mode: @@ -220,7 +226,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): tfe.async_wait() self._report(label, start, num_iters, device, batch_size, data_format) - def benchmark_eager_apply(self): + def benchmark_eager_apply_sync(self): self._benchmark_eager_apply('eager_apply', defun=False) def benchmark_eager_apply_async(self): @@ -234,11 +240,12 @@ class ResNet50Benchmarks(tf.test.Benchmark): label, make_iterator, defun=False, - execution_mode=None): + execution_mode=None, + device_and_format=None): with tfe.execution_mode(execution_mode): - device, data_format = device_and_data_format() + device, data_format = device_and_format or device_and_data_format() for batch_size in self._train_batch_sizes(): - (images, labels) = random_batch(batch_size) + (images, labels) = random_batch(batch_size, device_and_format) num_burn = 3 num_iters = 10 model = resnet50.ResNet50(data_format) @@ -253,7 +260,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): train_one_step(model, images, labels, optimizer) if execution_mode: tfe.async_wait() - self._force_gpu_sync() + self._force_device_sync() gc.collect() start = time.time() @@ -262,10 +269,10 @@ class ResNet50Benchmarks(tf.test.Benchmark): train_one_step(model, images, labels, optimizer) if execution_mode: tfe.async_wait() - self._force_gpu_sync() + self._force_device_sync() self._report(label, start, num_iters, device, batch_size, data_format) - def benchmark_eager_train(self): + def benchmark_eager_train_sync(self): self._benchmark_eager_train('eager_train', MockIterator, defun=False) def benchmark_eager_train_async(self): diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index 62ddb3d290e6c6455d79af50b2cfd894ff9f449a..b473de86ee8be92e8111ee5098b2536d4b957a8c 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -367,6 +367,7 @@ py_library( "//tensorflow/python:sparse_tensor", "//tensorflow/python:state_ops", "//tensorflow/python:training", + "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python/estimator:export_output", "//tensorflow/python/estimator:model_fn", diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py index a8774d6dab9205439e6e312827f9cd1306e3f1ea..f8564446e5da3e785b85010998d18dca0424d16b 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py @@ -47,8 +47,12 @@ from tensorflow.python.ops.losses import losses from tensorflow.python.platform import tf_logging from tensorflow.python.training import device_setter as device_setter_lib from tensorflow.python.training import optimizer as optimizer_lib +from tensorflow.python.util import deprecation +@deprecation.deprecated( + '2018-05-31', + 'Please use `tf.contrib.distribute.MirroredStrategy` instead.') def replicate_model_fn(model_fn, loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS, devices=None): @@ -255,6 +259,9 @@ class TowerOptimizer(optimizer_lib.Optimizer): COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states' + @deprecation.deprecated( + '2018-05-31', + 'Please use `tf.contrib.distribute.MirroredStrategy` instead.') def __init__(self, optimizer_or_optimizer_fn): """Wrap an existing optimizer for gathering gradients across towers. diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py index 144b45982c8aec2e2b115c812b24e8843d60ce1e..dd8a3a95f1b83bfd29e8a38ec1512f90e22968d9 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py @@ -540,59 +540,6 @@ class ReplicateAcrossASingleDeviceWithoutTowerOptimizer( self.assertEqual(7.0, session.run(c)) -class UseTowerEstimatorWithoutReplication(test_util.TensorFlowTestCase): - - def model_fn(self, mode, features, labels, params): - c = variable_scope.get_variable( - 'c', - initializer=constant_op.constant(10, dtype=dtypes.float64), - dtype=dtypes.float64) - - features = features['features'] - predictions = math_ops.multiply(features, c) - - loss = losses.absolute_difference( - labels=labels, predictions=predictions, reduction=losses.Reduction.SUM) - loss = math_ops.reduce_sum(loss) - - metrics = { - 'accuracy': metrics_lib.accuracy(labels, predictions), - 'auc': metrics_lib.auc(labels, predictions) - } - - optimizer = replicate_model_fn.TowerOptimizer( - gradient_descent.GradientDescentOptimizer(params['learning_rate'])) - - return model_fn_lib.EstimatorSpec( - mode=mode, - loss=loss, - eval_metric_ops=metrics, - predictions={'probabilities': predictions}, - train_op=optimizer.minimize(loss)) - - @property - def params(self): - params = {} - params['learning_rate'] = 1.0 - return params - - def test_train_single_tower(self): - features = np.array([[1.0], [2.0]]) - labels = np.array([[1.0], [2.0]]) - - train_input_fn = numpy_io.numpy_input_fn( - x={'features': features}, y=labels, batch_size=2, shuffle=False) - - with self.test_session(): - estimator = estimator_lib.Estimator( - model_fn=self.model_fn, - model_dir=tempfile.mkdtemp(), - params=self.params) - estimator.train(train_input_fn, steps=1) - - self.assertEqual(7.0, estimator.get_variable_value('c')) - - class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase): def model_fn(self, mode, features, labels, params): diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index 0a648d5d40e431bedb42017b15cabe078ac22fa7..f28d95401c3a5dd89ace9ff2014dd2207a3ea743 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -215,6 +215,7 @@ tf_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:sparse_tensor", ], + tags = ["noasan"], # times out b/78588193 ) # Estimators tests diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/contrib/factorization/kernels/clustering_ops.cc index 2a6c97e8b9526894eba057505a2bf823ad778f56..025534d540bb82cdb87bb2977d08dfa4f02f1bc8 100644 --- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc +++ b/tensorflow/contrib/factorization/kernels/clustering_ops.cc @@ -32,6 +32,7 @@ #include "tensorflow/core/lib/gtl/top_n.h" #include "tensorflow/core/lib/random/philox_random.h" #include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py index ccdd679d6ae197ab21d5fed5e89daa26bc15a809..e076631bc16fd379a2ad31af9055a7388d98c7ca 100644 --- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py +++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py @@ -397,7 +397,7 @@ class GmmAlgorithm(object): # Compute the effective number of data points assigned to component k. with ops.control_dependencies(self._w): points_in_k = array_ops.squeeze( - math_ops.add_n(self._points_in_k), squeeze_dims=[0]) + math_ops.add_n(self._points_in_k), axis=[0]) # Update alpha. if 'w' in self._params: final_points_in_k = points_in_k / num_batches diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc index 35341406a08dc681c861aea30fcff784e3b963ef..cca1a054193815793846a8753678f75bdfd72a6c 100644 --- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc +++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc @@ -28,7 +28,7 @@ #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/byte_order.h" #include "tensorflow/core/platform/env.h" using tensorflow::strings::StrCat; diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD index b1c8ad49eaf8d2400e431fcf4820fca6e0314557..249debbdf6dff412a5be6cb1032fc4a3567c7d0b 100644 --- a/tensorflow/contrib/framework/BUILD +++ b/tensorflow/contrib/framework/BUILD @@ -93,6 +93,7 @@ tf_kernel_library( ], deps = [ "//tensorflow/core:framework", + "//tensorflow/core:framework_headers_lib", "//third_party/eigen3", ], alwayslink = 1, @@ -177,6 +178,8 @@ cuda_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:tensor_array_ops", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/eager:context", ], ) diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 11397e86bd811e376ff4b60f86109d9ee5dd1a62..10d1ecc738de6777784200ba934a521dff592e28 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -108,6 +108,7 @@ from __future__ import print_function # pylint: disable=unused-import,wildcard-import from tensorflow.contrib.framework.python.framework import * +from tensorflow.contrib.framework.python.framework import nest from tensorflow.contrib.framework.python.ops import * # pylint: enable=unused-import,wildcard-import @@ -126,5 +127,20 @@ from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = ['nest', 'broadcast_to'] - +_nest_allowed_symbols = [ + 'assert_same_structure', + 'is_sequence', + 'flatten', + 'flatten_dict_items', + 'pack_sequence_as', + 'map_structure', + 'assert_shallow_structure', + 'flatten_up_to', + 'map_structure_up_to', + 'get_traverse_shallow_structure', + 'yield_flat_paths', + 'flatten_with_joined_string_paths', +] + +remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols) remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc index 5bf6b67529579e71a615c27e035111a58d5c02e0..6ab3f460b36d5dd632daee1af68d62529df9cb09 100644 --- a/tensorflow/contrib/framework/kernels/zero_initializer_op.cc +++ b/tensorflow/contrib/framework/kernels/zero_initializer_op.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_var.h" namespace tensorflow { @@ -85,4 +86,74 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_KERNELS +template +class ZeroVarInitializer : public OpKernel { + public: + explicit ZeroVarInitializer(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_)); + } + + void Compute(OpKernelContext* ctx) override { + Var* variable = nullptr; + OP_REQUIRES_OK(ctx, LookupOrCreateResource( + ctx, HandleFromInput(ctx, 0), &variable, + [this, ctx](Var** var_ptr) { + *var_ptr = new Var(dtype_); + PersistentTensor unused; + Tensor* var_tensor = nullptr; + AllocatorAttributes attr; + attr.set_gpu_compatible(true); + attr.set_nic_compatible(true); + TF_RETURN_IF_ERROR(ctx->allocate_persistent( + dtype_, shape_, &unused, &var_tensor, attr)); + + functor::TensorSetZero()( + ctx->eigen_device(), + var_tensor->flat()); + + *(*var_ptr)->tensor() = *var_tensor; + + return Status::OK(); + })); + + core::ScopedUnref scoped(variable); + mutex_lock ml(*variable->mu()); + + OP_REQUIRES(ctx, !variable->is_initialized, + errors::InvalidArgument("input is already initialized")); + + variable->is_initialized = true; + + Tensor* output = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output)); + output->scalar()() = HandleFromInput(ctx, 0); + } + + private: + DataType dtype_; + TensorShape shape_; +}; + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("dtype"), \ + ZeroVarInitializer); + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNEL_BUILDER(Name("ZeroVarInitializer") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("dtype") \ + .HostMemory("var"), \ + ZeroVarInitializer); + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#endif // GOOGLE_CUDA + } // namespace tensorflow diff --git a/tensorflow/contrib/framework/ops/variable_ops.cc b/tensorflow/contrib/framework/ops/variable_ops.cc index 706134ba9a51de6253ba7463b17ff662ea740ed0..f6ee6cdb5713c113aff2228db58244ac73536d9a 100644 --- a/tensorflow/contrib/framework/ops/variable_ops.cc +++ b/tensorflow/contrib/framework/ops/variable_ops.cc @@ -39,4 +39,33 @@ ref: Should be from a `Variable` node. output_ref:= Same as "ref". )doc"); +REGISTER_OP("ZeroVarInitializer") + .Input("var: resource") + .Output("output_var: resource") + .Attr("dtype: type") + .Attr("shape: shape") + .SetAllowsUninitializedInput() + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Scalar()); + DataType t; + TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t)); + PartialTensorShape p; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &p)); + shape_inference::ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s)); + c->set_output_handle_shapes_and_types( + 0, std::vector{{s, t}}); + + return Status::OK(); + }) + .Doc(R"doc( +Initialize 'var' with all zeros. This op requires that the resource var is not +initialized. The var will first be allocated memory, then be filled with all +zeros. This op is intended to save memory during initialization, +if you use this op, you should not run initializer of the var. + +var: Should be a ResourceVariable. +output_var:= Same as "var". +)doc"); + } // namespace tensorflow diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/contrib/framework/python/ops/critical_section_test.py index ba660295cb3c97d26da7bf892c78bceee53cf2d4..df7d7e9dae80722569efccbc9cc0d1b75e90cf03 100644 --- a/tensorflow/contrib/framework/python/ops/critical_section_test.py +++ b/tensorflow/contrib/framework/python/ops/critical_section_test.py @@ -19,6 +19,8 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.framework.python.ops import critical_section_ops +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -330,6 +332,25 @@ class CriticalSectionTest(test.TestCase): self.evaluate(v.initializer) self.assertEqual(10, self.evaluate(out)) + @test_util.run_in_graph_and_eager_modes() + def testInsideFunction(self): + cs = critical_section_ops.CriticalSection() + v = resource_variable_ops.ResourceVariable(1) + def fn(): + return v.read_value() + + # map() creates a TensorFlow function. + ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn)) + + def get_first(): + if context.executing_eagerly(): + return self.evaluate(ds.make_one_shot_iterator().get_next()) + itr = ds.make_initializable_iterator() + self.evaluate([v.initializer, itr.initializer]) + return self.evaluate(itr.get_next()) + + self.assertEqual(1, get_first()) + # TODO(ebrevdo): Re-enable once CriticalSection is in core. # # def testCriticalSectionAndExecuteOpSaverRoundTrip(self): diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py index 0754c3e0e30a340910a43a3ce86f6ca10afe848e..40ae01bfcce1dde580e6a5f6d9c8ec1aa1abb83f 100644 --- a/tensorflow/contrib/framework/python/ops/variables.py +++ b/tensorflow/contrib/framework/python/ops/variables.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import resource_loader from tensorflow.python.platform import tf_logging as logging @@ -82,7 +83,12 @@ def zero_initializer(ref, use_locking=True, name="zero_initializer"): """ loader.load_op_library( resource_loader.get_path_to_datafile("_variable_ops.so")) - return gen_variable_ops.zero_initializer(ref, name=name) + if resource_variable_ops.is_resource_variable(ref): + return gen_variable_ops.zero_var_initializer( + ref.handle, shape=ref.shape, dtype=ref.dtype, name=name) + else: + return gen_variable_ops.zero_initializer(ref, name=name) + @deprecated(None, "Please switch to tf.train.assert_global_step") def assert_global_step(global_step_tensor): diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py index 2f06df93acb0a4c0b36c68839ff531e3c22c5ee3..37ea6eb12aba7d25656f19cbbc86475c1228d916 100644 --- a/tensorflow/contrib/framework/python/ops/variables_test.py +++ b/tensorflow/contrib/framework/python/ops/variables_test.py @@ -1284,6 +1284,32 @@ class ZeroInitializerOpTest(test.TestCase): [10, 20], dtype=dtype), use_init) +class ZeroVarInitializerOpTest(test.TestCase): + + def _testZeroVarInitializer(self, shape, initializer, use_init): + var = resource_variable_ops.ResourceVariable(initializer) + var_zero = variables_lib2.zero_initializer(var) + + with self.test_session() as sess: + with self.assertRaisesOpError('Error while reading resource variable'): + var.eval() + if use_init: + sess.run(var.initializer) + with self.assertRaisesOpError('input is already initialized'): + var_zero.eval() + self.assertAllClose(np.ones(shape), var.eval()) + else: + var_zero.eval() + self.assertAllClose(np.zeros(shape), var.eval()) + + def testZeroVarInitializer(self): + for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64): + for use_init in (False, True): + self._testZeroVarInitializer([10, 20], + array_ops.ones([10, 20], dtype=dtype), + use_init) + + class FilterVariablesTest(test.TestCase): def setUp(self): diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc index 0e06575d96f9b9538f0245b12d48cfd7c0e8d981..2458f7554afdc12709571c551a8323cda7fa5c17 100644 --- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc +++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc @@ -247,7 +247,7 @@ class FusedConv2DBiasActivationOp : public OpKernel { }; #if GOOGLE_CUDA -namespace dnn = ::perftools::gputools::dnn; +namespace dnn = se::dnn; // A dummy type to group forward convolution autotune results together. struct ConvBiasActivationAutoTuneGroup { @@ -543,7 +543,8 @@ void LaunchFusedConv2DBiasActivationOp:: fused_conv_parameters, &algorithm_config)) { std::vector algorithms; CHECK(stream->parent()->GetConvolveAlgorithms( - fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo(), + fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo( + stream->parent()), &algorithms)); dnn::ProfileResult best_result; dnn::ProfileResult best_result_no_scratch; diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index 47e51415fd9e7daa360ca06a11078f6edcf63b5b..d914f549457a1e893ed43a3b8bc1ae5be7bb4303 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -488,25 +488,25 @@ def frechet_classifier_distance(real_images, The Frechet Inception distance. A floating-point scalar of the same type as the output of `classifier_fn`. """ - real_images_list = array_ops.split( real_images, num_or_size_splits=num_batches) generated_images_list = array_ops.split( generated_images, num_or_size_splits=num_batches) - imgs = array_ops.stack(real_images_list + generated_images_list) + real_imgs = array_ops.stack(real_images_list) + generated_imgs = array_ops.stack(generated_images_list) # Compute the activations using the memory-efficient `map_fn`. - activations = functional_ops.map_fn( - fn=classifier_fn, - elems=imgs, - parallel_iterations=1, - back_prop=False, - swap_memory=True, - name='RunClassifier') + def compute_activations(elems): + return functional_ops.map_fn(fn=classifier_fn, + elems=elems, + parallel_iterations=1, + back_prop=False, + swap_memory=True, + name='RunClassifier') - # Split the activations by the real and generated images. - real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0) + real_a = compute_activations(real_imgs) + gen_a = compute_activations(generated_imgs) # Ensure the activations have the right shapes. real_a = array_ops.concat(array_ops.unstack(real_a), 0) @@ -697,18 +697,20 @@ def frechet_classifier_distance_from_activations(real_activations, # Compute mean and covariance matrices of activations. m = math_ops.reduce_mean(real_activations, 0) m_w = math_ops.reduce_mean(generated_activations, 0) - num_examples = math_ops.to_double(array_ops.shape(real_activations)[0]) + num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0]) + num_examples_generated = math_ops.to_double( + array_ops.shape(generated_activations)[0]) # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T real_centered = real_activations - m sigma = math_ops.matmul( real_centered, real_centered, transpose_a=True) / ( - num_examples - 1) + num_examples_real - 1) gen_centered = generated_activations - m_w sigma_w = math_ops.matmul( gen_centered, gen_centered, transpose_a=True) / ( - num_examples - 1) + num_examples_generated - 1) # Find the Tr(sqrt(sigma sigma_w)) component of FID sqrt_trace_component = trace_sqrt_product(sigma, sigma_w) diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc index 645abbf0b0ea5465dadf55d065e997e16940c18d..bbb3a3b18fd7bfdc68e8b8532568985245154794 100644 --- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc +++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc @@ -59,7 +59,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count, delta_h, scale_s, scale_v, tranformation_matrix.flat().data(), tranformation_matrix.flat().size()); // Call cuBlas C = A * B directly. - auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + auto no_transpose = se::blas::Transpose::kNoTranspose; auto a_ptr = AsDeviceMemory(input->flat().data(), input->flat().size()); auto b_ptr = AsDeviceMemory(tranformation_matrix.flat().data(), diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py index 2a3592c53fdda488561e504ba2712aadc3214cc4..432b67e5690003d37bc0a4f53b3b091468b40a5c 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py @@ -814,6 +814,21 @@ class ConvInputKroneckerFactorTest(ConvFactorTestCase): new_cov = sess.run(factor.make_covariance_update_op(0.)) self.assertAllClose([[(1. + 4.) / 2.]], new_cov) + def testSubSample(self): + with tf_ops.Graph().as_default(): + patches_1 = array_ops.constant(1, shape=(10, 2)) + patches_2 = array_ops.constant(1, shape=(10, 8)) + patches_3 = array_ops.constant(1, shape=(3, 3)) + patches_1_sub = ff._subsample_for_cov_computation(patches_1) + patches_2_sub = ff._subsample_for_cov_computation(patches_2) + patches_3_sub = ff._subsample_for_cov_computation(patches_3) + patches_1_sub_batch_size = patches_1_sub.shape.as_list()[0] + patches_2_sub_batch_size = patches_2_sub.shape.as_list()[0] + patches_3_sub_batch_size = patches_3_sub.shape.as_list()[0] + self.assertEqual(2, patches_1_sub_batch_size) + self.assertEqual(8, patches_2_sub_batch_size) + self.assertEqual(3, patches_3_sub_batch_size) + class ConvOutputKroneckerFactorTest(ConvFactorTestCase): diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD index b897fd68a080e819042cd36f2a1acfcf175e656b..cb0917bb851cff2dd25e398ec9dc3cc0ffcddedf 100644 --- a/tensorflow/contrib/kfac/python/ops/BUILD +++ b/tensorflow/contrib/kfac/python/ops/BUILD @@ -37,10 +37,13 @@ py_library( deps = [ ":utils", "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:init_ops", "//tensorflow/python:linalg_ops", "//tensorflow/python:math_ops", + "//tensorflow/python:random_ops", "//tensorflow/python:special_math_ops", "//tensorflow/python:training", "//tensorflow/python:variable_scope", diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py index 0d40d265a1727075d0ba721b0d9a756c38269a96..7988a3b92bf0139b382e43acc534664ca5bb261e 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops from tensorflow.python.ops import special_math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -55,6 +56,22 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2 # matrix powers. Must be nonnegative. EIGENVALUE_CLIPPING_THRESHOLD = 0.0 +# Used to subsample the flattened extracted image patches. The number of +# outer products per row of the covariance matrix should not exceed this +# value. This parameter is used only if `_SUB_SAMPLE_OUTER_PRODUCTS` is True. +_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = 1 + +# Used to subsample the inputs passed to the extract image patches. The batch +# size of number of inputs to extract image patches is multiplied by this +# factor. This parameter is used only if `_SUB_SAMPLE_INPUTS` is True. +_INPUTS_TO_EXTRACT_PATCHES_FACTOR = 0.5 + +# If True, then subsamples the tensor passed to compute the covaraince matrix. +_SUB_SAMPLE_OUTER_PRODUCTS = False + +# If True, then subsamples the tensor passed to compute the covaraince matrix. +_SUB_SAMPLE_INPUTS = False + # TOWER_STRATEGY can be one of "concat" or "separate". If "concat", the data # passed to the factors from the blocks will be concatenated across towers # (lazilly via PartitionedTensor objects). Otherwise a tuple of tensors over @@ -67,12 +84,20 @@ def set_global_constants(init_covariances_at_zero=None, zero_debias=None, eigenvalue_decomposition_threshold=None, eigenvalue_clipping_threshold=None, + max_num_outer_products_per_cov_row=None, + sub_sample_outer_products=None, + inputs_to_extract_patches_factor=None, + sub_sample_inputs=None, tower_strategy=None): """Sets various global constants used by the classes in this module.""" global INIT_COVARIANCES_AT_ZERO global ZERO_DEBIAS global EIGENVALUE_DECOMPOSITION_THRESHOLD global EIGENVALUE_CLIPPING_THRESHOLD + global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW + global _SUB_SAMPLE_OUTER_PRODUCTS + global _INPUTS_TO_EXTRACT_PATCHES_FACTOR + global _SUB_SAMPLE_INPUTS global TOWER_STRATEGY if init_covariances_at_zero is not None: @@ -83,6 +108,14 @@ def set_global_constants(init_covariances_at_zero=None, EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold if eigenvalue_clipping_threshold is not None: EIGENVALUE_CLIPPING_THRESHOLD = eigenvalue_clipping_threshold + if max_num_outer_products_per_cov_row is not None: + _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW = max_num_outer_products_per_cov_row + if sub_sample_outer_products is not None: + _SUB_SAMPLE_OUTER_PRODUCTS = sub_sample_outer_products + if inputs_to_extract_patches_factor is not None: + _INPUTS_TO_EXTRACT_PATCHES_FACTOR = inputs_to_extract_patches_factor + if sub_sample_inputs is not None: + _SUB_SAMPLE_INPUTS = sub_sample_inputs if tower_strategy is not None: TOWER_STRATEGY = tower_strategy @@ -227,6 +260,58 @@ def graph_func_to_string(func): return list_to_string(func.func_id) +def _subsample_for_cov_computation(array, name=None): + """Subsamples the first dimension of the array. + + `array`(A) is a tensor of shape `[batch_size, dim_2]`. Then the covariance + matrix(A^TA) is of shape `dim_2 ** 2`. Subsample only if the number of outer + products per row of the covariance matrix is greater than + `_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW`. + + Args: + array: Tensor, of shape `[batch_size, dim_2]`. + name: `string`, Default(None) + + Returns: + A tensor of shape `[max_samples, dim_2]`. + + Raises: + ValueError: If array's is not matrix-shaped. + ValueError: If array's batch_size cannot be inferred. + + """ + with tf_ops.name_scope(name, "subsample", [array]): + array = tf_ops.convert_to_tensor(array) + if len(array.shape) != 2: + raise ValueError("Input param array must be a matrix.") + + batch_size = array.shape.as_list()[0] + if batch_size is None: + raise ValueError("Unable to get batch_size from input param array.") + + num_cov_rows = array.shape.as_list()[-1] + max_batch_size = int(_MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW * num_cov_rows) + if batch_size <= max_batch_size: + return array + + return _random_tensor_gather(array, max_batch_size) + + +def _random_tensor_gather(array, max_size): + """Generates a random set of indices and gathers the value at the indcices. + + Args: + array: Tensor, of shape `[batch_size, dim_2]`. + max_size: int, Number of indices to sample. + + Returns: + A tensor of shape `[max_size, ...]`. + """ + batch_size = array.shape.as_list()[0] + indices = random_ops.random_shuffle(math_ops.range(0, batch_size))[:max_size] + return array_ops.gather(array, indices) + + @six.add_metaclass(abc.ABCMeta) class FisherFactor(object): """Base class for objects modeling factors of approximate Fisher blocks. @@ -1153,7 +1238,9 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): dilation_rate=None, data_format=None, extract_patches_fn=None, - has_bias=False): + has_bias=False, + sub_sample_inputs=None, + sub_sample_patches=None): """Initializes ConvInputKroneckerFactor. Args: @@ -1173,6 +1260,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): patches. One of "extract_convolution_patches", "extract_image_patches", "extract_pointwise_conv2d_patches". has_bias: bool. If True, append 1 to in_channel. + sub_sample_inputs: `bool`. If True, then subsample the inputs from which + the image patches are extracted. (Default: None) + sub_sample_patches: `bool`, If `True` then subsample the extracted + patches.(Default: None) """ self._inputs = inputs self._filter_shape = filter_shape @@ -1182,7 +1273,15 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): self._data_format = data_format self._extract_patches_fn = extract_patches_fn self._has_bias = has_bias + if sub_sample_inputs is None: + self._sub_sample_inputs = _SUB_SAMPLE_INPUTS + else: + self._sub_sample_inputs = sub_sample_inputs + if sub_sample_patches is None: + self._sub_sample_patches = _SUB_SAMPLE_OUTER_PRODUCTS + else: + self._sub_sample_patches = sub_sample_patches super(ConvInputKroneckerFactor, self).__init__() @property @@ -1215,6 +1314,10 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): assert source == 0 inputs = self._inputs[tower] + if self._sub_sample_inputs: + batch_size = inputs.shape.as_list()[0] + max_size = int(batch_size * _INPUTS_TO_EXTRACT_PATCHES_FACTOR) + inputs = _random_tensor_gather(inputs, max_size) # TODO(b/64144716): there is potential here for a big savings in terms of # memory use. @@ -1260,8 +1363,12 @@ class ConvInputKroneckerFactor(InverseProvidingFactor): # |Delta| = number of spatial offsets, and J = number of input maps # for convolutional layer l. patches_flat = array_ops.reshape(patches, [-1, flatten_size]) + # We append a homogenous coordinate to patches_flat if the layer has # bias parameters. This gives us [[A_l]]_H from the paper. + if self._sub_sample_patches: + patches_flat = _subsample_for_cov_computation(patches_flat) + if self._has_bias: patches_flat = append_homog(patches_flat) # We call compute_cov without passing in a normalizer. compute_cov uses diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py index 9f904cc3028121fb53f6622e3a0773010989241c..02d294c68f1e10108d774c5fe23b6371a7a9f0e6 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py @@ -45,7 +45,6 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest -from tensorflow.python.util import tf_inspect __all__ = ["rev_block", "RevBlock", "recompute_grad"] @@ -430,13 +429,12 @@ def enable_with_args(dec): @enable_with_args -def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False, - tensor_arg_names=None): +def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False): """Decorator that recomputes the function on the backwards pass. Args: - fn: the subgraph-producing function to wrap and recompute when computing - gradients. Provide `tensor_arg_names` if not all arguments are `Tensor`s. + fn: a function that takes Tensors (all as positional arguments) and returns + a tuple of Tensors. use_data_dep: `bool`, if `True` will use a dummy data dependency to force the recompute to happen. If `False` will use a control dependency. By default will be `True` if in an XLA context and `False` otherwise. XLA @@ -445,25 +443,17 @@ def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False, that all gradients are produced before any are consumed by downstream ops. If `use_data_dep` is also `True`, will use a data dependency instead of a control dependency. - tensor_arg_names: `list`, names of the `Tensor` arguments to `fn`. If - `None`, assumes all arguments are `Tensor`s. Returns: A wrapped fn that is identical to fn when called, but its activations will be discarded and recomputed on the backwards pass (i.e. on a call to tf.gradients). """ - if tensor_arg_names: - if not isinstance(tensor_arg_names, (list, tuple)): - raise TypeError("tensor_arg_names must be a list") @functools.wraps(fn) - def wrapped(*args, **kwargs): - tensor_only_fn, tensor_args = _make_tensor_only(fn, args, kwargs, - tensor_arg_names) + def wrapped(*args): return _recompute_grad( - tensor_only_fn, tensor_args, use_data_dep=use_data_dep, - tupleize_grads=tupleize_grads) + fn, args, use_data_dep=use_data_dep, tupleize_grads=tupleize_grads) return wrapped @@ -473,59 +463,11 @@ def _is_on_tpu(): return control_flow_util.GetContainingXLAContext(ctxt) is not None -def _make_tensor_only(fn, args, kwargs, tensor_arg_names): - """Return fn such that it only takes Tensor args for tensor_arg_names.""" - argspec = tf_inspect.getargspec(fn) - if argspec.varargs is not None or argspec.keywords is not None: - raise ValueError("Function decorated with recompute_grad must not use " - "*args or **kwargs.") - fn_arg_names = list(argspec.args) - - # name_to_arg is a dict of argument name to argument value, including both - # positional and keyword arguments passed. - name_to_arg = {} - # Populate positional arguments. - for name, arg in zip(fn_arg_names[:len(args)], args): - name_to_arg[name] = arg - # Populate keyword arguments. - name_to_arg.update(kwargs) - - # Separate the Tensor arguments from the non-Tensor arguments. - # The default is that all arguments are Tensor arguments. - tensor_arg_names = tensor_arg_names or fn_arg_names - for name in tensor_arg_names: - if name not in name_to_arg: - raise ValueError("Must provide Tensor argument %s" % name) - tensor_args = [name_to_arg[name] for name in tensor_arg_names] - non_tensor_kwargs = dict([(name, arg) for name, arg in name_to_arg.items() - if name not in tensor_arg_names]) - - # Check that Tensor arguments are in fact Tensors and that non-Tensor - # arguments are not. - for name, arg in zip(tensor_arg_names, tensor_args): - if not isinstance(arg, framework_ops.Tensor): - raise TypeError("Fn argument %s must be a Tensor." % name) - for name, arg in non_tensor_kwargs.items(): - if isinstance(arg, framework_ops.Tensor): - raise TypeError("Fn argument %s must not be a Tensor." % name) - - # Construct a Tensor-only wrapper function that will pass the non-Tensor - # arguments as well when called. - def tensor_only_fn(*tensors): - all_kwargs = dict(zip(tensor_arg_names, tensors)) - all_kwargs.update(non_tensor_kwargs) - return fn(**all_kwargs) - - return tensor_only_fn, tensor_args - - -def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, - tupleize_grads=False): +def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): """See recompute_grad.""" for arg in args: if not isinstance(arg, framework_ops.Tensor): raise ValueError("All inputs to function must be Tensors") - use_data_dep_ = use_data_dep if use_data_dep_ == _USE_DEFAULT: use_data_dep_ = _is_on_tpu() @@ -559,11 +501,14 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, grad_vars = grads[len(inputs):] return grad_inputs, grad_vars - # TODO(rsepassi): Replace with tf.custom_gradient @_fn_with_custom_grad(grad_fn) def fn_with_recompute(*args): cached_vs.append(variable_scope.get_variable_scope()) - cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) + # TODO(rsepassi): Rm conditional in TF 1.4 + if hasattr(contrib_framework_ops, "current_arg_scope"): + cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) + else: + cached_arg_scope.append({}) return fn(*args) return fn_with_recompute(*args) diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py index 66309b9d89b415224087ff821938bb0281afcf55..8c118402a4c85d4b0504754fcd0436ce8b00862d 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py @@ -318,108 +318,6 @@ class RecomputeTest(test.TestCase): self.assertEqual(1, len(grads)) self.assertTrue(grads[0] is not None) - def testWithNontensorArgs(self): - @rev_block_lib.recompute_grad(tupleize_grads=True, - tensor_arg_names=["inputs"]) - def layer_with_recompute(inputs, plus=None): - var = variable_scope.get_variable("var", ()) - self.assertFalse(plus) # called with False below - if plus: - return var + inputs - else: - return var * inputs - - inputs = array_ops.ones((), dtypes.float32) - outputs = layer_with_recompute(inputs, plus=False) - loss = math_ops.square(outputs) - grads = gradients_impl.gradients(loss, variables.trainable_variables()) - self.assertEqual(1, len(grads)) - self.assertTrue(grads[0] is not None) - - -class MakeTensorOnlyTest(test.TestCase): - - def testMakeTensorOnly(self): - def fn(a, b, c, d=1, e=None, f=7): - return (a, b, c, d, e, f) - - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - t3 = array_ops.ones(()) - args = [1, t1, 3, t2] - kwargs = {"e": t3} - tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( - fn, args, kwargs, ["b", "d", "e"]) - self.assertAllEqual(tensor_args, [t1, t2, t3]) - out = tensor_only_fn(*tensor_args) - self.assertAllEqual(out, (1, t1, 3, t2, t3, 7)) - - def testMakeTensorOnlyPositionalArgsOnly(self): - def fn(a, b, c): - return (a, b, c) - - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - args = [t1, 3, t2] - tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( - fn, args, {}, ["a", "c"]) - self.assertAllEqual(tensor_args, [t1, t2]) - out = tensor_only_fn(*tensor_args) - self.assertAllEqual(out, (t1, 3, t2)) - - def testMakeTensorOnlyKwargsArgsOnly(self): - def fn(a=1, b=2, c=3): - return (a, b, c) - - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - args = [t1] - kwargs = {"c": t2} - tensor_only_fn, tensor_args = rev_block_lib._make_tensor_only( - fn, args, kwargs, ["a", "c"]) - self.assertAllEqual(tensor_args, [t1, t2]) - out = tensor_only_fn(*tensor_args) - self.assertAllEqual(out, (t1, 2, t2)) - - def testErrorOnMissingTensorArg(self): - def fn(a, b): - return (a, b) - - with self.assertRaisesWithPredicateMatch( - ValueError, "provide Tensor argument"): - rev_block_lib._make_tensor_only(fn, [], {"b": 2}, ["a"]) - - def testErrorOnSignatureSplats(self): - def fn1(a, *args): - return (a, args) - - err_msg = r"must not use \*args or \*\*kwargs" - with self.assertRaisesWithPredicateMatch(ValueError, err_msg): - rev_block_lib._make_tensor_only(fn1, [1, 2], {}, ["a"]) - - def fn2(a, **kwargs): - return (a, kwargs) - - with self.assertRaisesWithPredicateMatch(ValueError, err_msg): - rev_block_lib._make_tensor_only(fn2, [], {"a": 1, "b": 2}, ["a"]) - - def testErrorOnNonTensorForTensor(self): - def fn(a, b): - return (a, b) - - with self.assertRaisesWithPredicateMatch(TypeError, "must be a Tensor"): - rev_block_lib._make_tensor_only(fn, [2, 3], {}, ["a"]) - - def testErrorOnTensorForNonTensor(self): - def fn(a, b): - return (a, b) - - with self.assertRaisesWithPredicateMatch( - TypeError, "must not be a Tensor"): - t1 = array_ops.ones(()) - t2 = array_ops.ones(()) - rev_block_lib._make_tensor_only(fn, [t1, t2], {}, ["a"]) - class FnWithCustomGradTest(test.TestCase): diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py index 3e639a180ef11af5f7f498c647eb25417f918eb9..69bb6be81453f5f5487f25547f017dc5f87c2f2c 100644 --- a/tensorflow/contrib/layers/python/layers/target_column.py +++ b/tensorflow/contrib/layers/python/layers/target_column.py @@ -270,7 +270,7 @@ class _RegressionTargetColumn(_TargetColumn): def logits_to_predictions(self, logits, proba=False): if self.num_label_columns == 1: - return array_ops.squeeze(logits, squeeze_dims=[1]) + return array_ops.squeeze(logits, axis=[1]) return logits def get_eval_ops(self, features, logits, labels, metrics=None): @@ -418,7 +418,7 @@ def _softmax_cross_entropy_loss(logits, target): "Instead got %s." % target.dtype) # sparse_softmax_cross_entropy_with_logits requires [batch_size] target. if len(target.get_shape()) == 2: - target = array_ops.squeeze(target, squeeze_dims=[1]) + target = array_ops.squeeze(target, axis=[1]) loss_vec = nn.sparse_softmax_cross_entropy_with_logits( labels=target, logits=logits) return loss_vec diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index d81a534b79bc90fe91ffd3cb97a7865a7cb4c2a9..9e5aaf3118dfed4ce64dd244a915860b5a2eef44 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -715,7 +715,9 @@ class EstimatorTest(test.TestCase): ckpt = checkpoint_state_pb2.CheckpointState() text_format.Merge(checkpoint_file_content, ckpt) self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5') - self.assertAllEqual(['model.ckpt-1', 'model.ckpt-5'], + # TODO(b/78461127): Please modify tests to not directly rely on names of + # checkpoints. + self.assertAllEqual(['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths) def test_train_save_copy_reload(self): diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 2b4b6eff39f4fc8a20a149edfc07d2f4f27a9bae..e28e6854a5097d66cb486be3e82f3726f5cc70fd 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -777,7 +777,7 @@ class _RegressionHead(_SingleHead): key = prediction_key.PredictionKey.SCORES with ops.name_scope(None, "predictions", (logits,)): if self.logits_dimension == 1: - logits = array_ops.squeeze(logits, squeeze_dims=(1,), name=key) + logits = array_ops.squeeze(logits, axis=(1,), name=key) return {key: self._link_fn(logits)} def _metrics(self, eval_loss, predictions, labels, weights): @@ -974,7 +974,7 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None): is_squeezed_labels = False # TODO(ptucker): This will break for dynamic shapes. if len(labels.get_shape()) == 2: - labels = array_ops.squeeze(labels, squeeze_dims=(1,)) + labels = array_ops.squeeze(labels, axis=(1,)) is_squeezed_labels = True loss = nn.sparse_softmax_cross_entropy_with_logits( diff --git a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py index 92976d1539c7ddc226b81f903beee82b798ec8db..9f2cadb01747c5a8e4ee75ac38f423f85e11bbba 100644 --- a/tensorflow/contrib/learn/python/learn/ops/losses_ops.py +++ b/tensorflow/contrib/learn/python/learn/ops/losses_ops.py @@ -40,7 +40,7 @@ def mean_squared_error_regressor(tensor_in, labels, weights, biases, name=None): [tensor_in, labels]): predictions = nn.xw_plus_b(tensor_in, weights, biases) if len(labels.get_shape()) == 1 and len(predictions.get_shape()) == 2: - predictions = array_ops_.squeeze(predictions, squeeze_dims=[1]) + predictions = array_ops_.squeeze(predictions, axis=[1]) return predictions, losses.mean_squared_error(labels, predictions) diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py index 38bd66b13f79ee918acbdc2d33a6367e4e34d4b1..554854da84715ee8c8d00ec7f8e3156642b43d80 100644 --- a/tensorflow/contrib/linalg/__init__.py +++ b/tensorflow/contrib/linalg/__init__.py @@ -18,6 +18,9 @@ See the @{$python/contrib.linalg} guide. @@LinearOperator @@LinearOperatorBlockDiag +@@LinearOperatorCirculant +@@LinearOperatorCirculant2D +@@LinearOperatorCirculant3D @@LinearOperatorDiag @@LinearOperatorIdentity @@LinearOperatorScaledIdentity @@ -39,6 +42,7 @@ from tensorflow.contrib.linalg.python.ops.linear_operator_addition import * from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import * from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import * from tensorflow.python.ops.linalg.linear_operator import * +from tensorflow.python.ops.linalg.linear_operator_circulant import * from tensorflow.python.ops.linalg.linear_operator_composition import * from tensorflow.python.ops.linalg.linear_operator_diag import * from tensorflow.python.ops.linalg.linear_operator_full_matrix import * diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py index 6e6c812adcbaf7e90d7c10c05fdfc0e150829329..b5741967ab52568725d7c9f03a0cc0b0f63f7459 100644 --- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py +++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py @@ -39,8 +39,8 @@ from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import googletest _MAX_ITERATIONS = 100 -_SHARD_NUMBERS = [None, 1, 3, 10] -_NUM_LOSS_PARTITIONS = [2, 4] +_SHARD_NUMBERS = [None, 1, 3] +_NUM_LOSS_PARTITIONS = [4] def make_example_proto(feature_dict, target, value=1.0): diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index b8f6b7fd59af9834edb4aa7aefa524c25ede66d2..85216776823eab2ab3ac2a3bc666f21e312acc6c 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -124,19 +124,19 @@ def tf_to_tflite(name, src, options, out): out: name of the output flatbuffer file. """ - toco = "//tensorflow/contrib/lite/toco:toco" + toco_cmdline = " ".join([ + "//tensorflow/contrib/lite/toco:toco", + "--input_format=TENSORFLOW_GRAPHDEF", + "--output_format=TFLITE", + ("--input_file=$(location %s)" % src), + ("--output_file=$(location %s)" % out), + ] + options ) native.genrule( name = name, - srcs=[src, options], + srcs=[src], outs=[out], - cmd = ("$(location %s) " + - " --input_file=$(location %s) " + - " --output_file=$(location %s) " + - " --input_format=TENSORFLOW_GRAPHDEF" + - " --output_format=TFLITE" + - " `cat $(location %s)`") - % (toco, src, out, options), - tools= [toco], + cmd = toco_cmdline, + tools= ["//tensorflow/contrib/lite/toco:toco"], ) def tflite_to_json(name, src, out): diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h index 0b38f43cd32fbdfa0296eec7ef81aab76ebe5461..12841d233cc1d3c5e1219fc505b1975d2a7fa3e3 100644 --- a/tensorflow/contrib/lite/context.h +++ b/tensorflow/contrib/lite/context.h @@ -275,7 +275,7 @@ typedef struct { typedef struct TfLiteContext { // Number of tensors in the context. - int tensors_size; + size_t tensors_size; // The execution plan contains a list of the node indices in execution // order. execution_plan->size is the current number of nodes. And, @@ -397,13 +397,13 @@ typedef struct _TfLiteDelegate { // This can be null if the delegate doesn't use its own buffer. TfLiteStatus (*CopyFromBufferHandle)(TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size); + void* data, size_t size); // Copy the data from raw memory to delegate buffer handle. // This can be null if the delegate doesn't use its own buffer. TfLiteStatus (*CopyToBufferHandle)(TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size); + void* data, size_t size); // Free the Delegate Buffer Handle. Note: This only frees the handle, but // this doesn't release the underlying resource (e.g. textures). The diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 91b6c414bf036fbf57f53fc75f570b05449fa89e..9d8ea55fd1edc0dacc821536cc2b564c59f65b71 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -308,7 +308,12 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label, for (int i = 0; i < length; i++) { int index = indices[i]; - if (index < kOptionalTensor || index >= context_.tensors_size) { + // Continue if index == kOptionalTensor before additional comparisons below, + // size_t(-1) is always >= context_tensors_size. + if (index == kOptionalTensor) { + continue; + } + if (index < 0 || static_cast(index) >= context_.tensors_size) { ReportError(&context_, "Invalid tensor index %d in %s\n", index, label); consistent_ = false; return kTfLiteError; @@ -318,7 +323,7 @@ TfLiteStatus Interpreter::CheckTensorIndices(const char* label, } TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims, - int dims_size, size_t* bytes) { + size_t dims_size, size_t* bytes) { // TODO(aselle): Check for overflow here using overflow.h in TensorFlow // MultiplyWithoutOverflow. TF_LITE_ENSURE(&context_, bytes != nullptr); @@ -645,7 +650,7 @@ TfLiteStatus Interpreter::GetNodeAndRegistration( } TfLiteStatus Interpreter::SetTensorParametersReadOnly( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization, const char* buffer, size_t bytes, const Allocation* allocation) { if (state_ == kStateInvokableAndImmutable) { @@ -691,7 +696,7 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly( // bytes. The lifetime of buffer must be ensured to be greater or equal // to Interpreter. TfLiteStatus Interpreter::SetTensorParametersReadWrite( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization) { if (state_ == kStateInvokableAndImmutable) { ReportError( diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index a49134b95ee47b97ad1e56d07f737fa44f89badc..6f3433abcf71b6090b434d47e925775a2e517064 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -150,7 +150,7 @@ class Interpreter { }; TfLiteStatus SetTensorParametersReadOnly( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization, const char* buffer, size_t bytes, const Allocation* allocation = nullptr); @@ -165,7 +165,7 @@ class Interpreter { dims.data(), quantization); } TfLiteStatus SetTensorParametersReadWrite( - int tensor_index, TfLiteType type, const char* name, const int rank, + int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantizationParams quantization); // Functions to access tensor data @@ -189,10 +189,10 @@ class Interpreter { } // Return the number of tensors in the model. - int tensors_size() const { return context_.tensors_size; } + size_t tensors_size() const { return context_.tensors_size; } // Return the number of ops in the model. - int nodes_size() const { return nodes_and_registration_.size(); } + size_t nodes_size() const { return nodes_and_registration_.size(); } // WARNING: Experimental interface, subject to change const std::vector& execution_plan() const { return execution_plan_; } @@ -406,7 +406,7 @@ class Interpreter { // Compute the number of bytes required to represent a tensor with dimensions // specified by the array dims (of length dims_size). Returns the status code // and bytes. - TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size, + TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size, size_t* bytes); // Request an tensor be resized implementation. If the given tensor is of @@ -467,7 +467,7 @@ class Interpreter { // tensors. After calling this function, adding `kTensorsCapacityHeadroom` // more tensors won't invalidate the pointer to existing tensors. void EnsureTensorsVectorCapacity() { - const int required_capacity = tensors_size() + kTensorsCapacityHeadroom; + const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom; if (required_capacity > tensors_.capacity()) { tensors_.reserve(required_capacity); context_.tensors = tensors_.data(); diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc index 131e088079857af34478645b7f1559364d03a493..453c1ada1cf6263be14a3b170f209e3a30580cc3 100644 --- a/tensorflow/contrib/lite/interpreter_test.cc +++ b/tensorflow/contrib/lite/interpreter_test.cc @@ -887,15 +887,15 @@ class TestDelegate : public ::testing::Test { TfLiteIntArrayFree(nodes_to_separate); return kTfLiteOk; }; - delegate_.CopyToBufferHandle = [](TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - void* data, int size) -> TfLiteStatus { + delegate_.CopyToBufferHandle = + [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, + void* data, size_t size) -> TfLiteStatus { // TODO(ycling): Implement tests to test buffer copying logic. return kTfLiteOk; }; delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, - void* data, int size) -> TfLiteStatus { + void* data, size_t size) -> TfLiteStatus { // TODO(ycling): Implement tests to test buffer copying logic. return kTfLiteOk; }; diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java index 18f64651889d7eeb4be961afc47554cbcc51a410..4f5662bc2d15f1bf6bfec0b9ec79b09f9e124186 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java +++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java @@ -47,6 +47,8 @@ import android.os.HandlerThread; import android.support.annotation.NonNull; import android.support.v13.app.FragmentCompat; import android.support.v4.content.ContextCompat; +import android.text.SpannableString; +import android.text.SpannableStringBuilder; import android.util.Log; import android.util.Size; import android.view.LayoutInflater; @@ -207,14 +209,21 @@ public class Camera2BasicFragment extends Fragment * * @param text The message to show */ - private void showToast(final String text) { + private void showToast(String s) { + SpannableStringBuilder builder = new SpannableStringBuilder(); + SpannableString str1 = new SpannableString(s); + builder.append(str1); + showToast(builder); + } + + private void showToast(SpannableStringBuilder builder) { final Activity activity = getActivity(); if (activity != null) { activity.runOnUiThread( new Runnable() { @Override public void run() { - textView.setText(text); + textView.setText(builder, TextView.BufferType.SPANNABLE); } }); } @@ -682,8 +691,9 @@ public class Camera2BasicFragment extends Fragment showToast("Uninitialized Classifier or invalid context."); return; } + SpannableStringBuilder textToShow = new SpannableStringBuilder(); Bitmap bitmap = textureView.getBitmap(classifier.getImageSizeX(), classifier.getImageSizeY()); - String textToShow = classifier.classifyFrame(bitmap); + classifier.classifyFrame(bitmap, textToShow); bitmap.recycle(); showToast(textToShow); } diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java index d32c0779101cf8e795ee9d7e970401c2c03bb33a..7bb6afd9d8b77159bb180fad6bbe43ca454f9d14 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java +++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java @@ -19,10 +19,11 @@ import android.app.Activity; import android.content.res.AssetFileDescriptor; import android.graphics.Bitmap; import android.os.SystemClock; +import android.text.SpannableString; +import android.text.SpannableStringBuilder; +import android.text.style.ForegroundColorSpan; +import android.text.style.RelativeSizeSpan; import android.util.Log; - -import org.tensorflow.lite.Interpreter; - import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; @@ -37,11 +38,15 @@ import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.PriorityQueue; +import org.tensorflow.lite.Interpreter; /** * Classifies images with Tensorflow Lite. */ public abstract class ImageClassifier { + // Display preferences + private static final float GOOD_PROB_THRESHOLD = 0.3f; + private static final int SMALL_COLOR = 0xffddaa88; /** Tag for the {@link Log}. */ private static final String TAG = "TfLiteCameraDemo"; @@ -99,10 +104,12 @@ public abstract class ImageClassifier { } /** Classifies a frame from the preview stream. */ - String classifyFrame(Bitmap bitmap) { + void classifyFrame(Bitmap bitmap, SpannableStringBuilder builder) { + printTopKLabels(builder); + if (tflite == null) { Log.e(TAG, "Image classifier has not been initialized; Skipped."); - return "Uninitialized Classifier."; + builder.append(new SpannableString("Uninitialized Classifier.")); } convertBitmapToByteBuffer(bitmap); // Here's where the magic happens!!! @@ -115,9 +122,10 @@ public abstract class ImageClassifier { applyFilter(); // Print the results. - String textToShow = printTopKLabels(); - textToShow = Long.toString(endTime - startTime) + "ms" + textToShow; - return textToShow; + long duration = endTime - startTime; + SpannableString span = new SpannableString(duration + " ms"); + span.setSpan(new ForegroundColorSpan(android.graphics.Color.LTGRAY), 0, span.length(), 0); + builder.append(span); } void applyFilter() { @@ -202,7 +210,7 @@ public abstract class ImageClassifier { } /** Prints top-K labels, to be shown in UI as the results. */ - private String printTopKLabels() { + private void printTopKLabels(SpannableStringBuilder builder) { for (int i = 0; i < getNumLabels(); ++i) { sortedLabels.add( new AbstractMap.SimpleEntry<>(labelList.get(i), getNormalizedProbability(i))); @@ -210,13 +218,27 @@ public abstract class ImageClassifier { sortedLabels.poll(); } } - String textToShow = ""; + final int size = sortedLabels.size(); - for (int i = 0; i < size; ++i) { + for (int i = 0; i < size; i++) { Map.Entry label = sortedLabels.poll(); - textToShow = String.format("\n%s: %4.2f", label.getKey(), label.getValue()) + textToShow; + SpannableString span = + new SpannableString(String.format("%s: %4.2f\n", label.getKey(), label.getValue())); + int color; + // Make it white when probability larger than threshold. + if (label.getValue() > GOOD_PROB_THRESHOLD) { + color = android.graphics.Color.WHITE; + } else { + color = SMALL_COLOR; + } + // Make first item bigger. + if (i == size - 1) { + float sizeScale = (i == size - 1) ? 1.25f : 0.8f; + span.setSpan(new RelativeSizeSpan(sizeScale), 0, span.length(), 0); + } + span.setSpan(new ForegroundColorSpan(color), 0, span.length(), 0); + builder.insert(0, span); } - return textToShow; } /** diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png index c22509d8dfccae14d9470e3042a9ed5b469ca2c9..52cf2ab95296d675dd42533bb9136707adebd98c 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png index d68af39186ca9cd2bc755cad8397467a11844a1d..b75f892c462a12cae4f09851d019db23b286f843 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png index 15e419b7ccd88651bd21dac36853a827fc4075b8..36e14c48d14a8d3e5bf37d3caaee661061cec3be 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png index 342ce34e1663960d8d7050a9be57face3571d336..06dd2a740ec2abaec4919c991dd17ee007ffcf28 100644 Binary files a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..b94bcfc081e0b036fbba271d7cbfb986575d4abf Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png differ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml index a84f1bbfa0cb48a3fc335c9bc4aa7d8e93d20e75..20f520814d7154764932638c5e9dddc32639b677 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml @@ -14,37 +14,50 @@ limitations under the License. --> - - - - - + + + + + + + + + + - - + diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml new file mode 100644 index 0000000000000000000000000000000000000000..72a229ecdb19f5309994e994d82e0b5b5ed617a2 --- /dev/null +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml @@ -0,0 +1,88 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml index db557ad62f619e88f72426a48a74bffb0f57b818..d12435d5abda45917b8a4f12c4b3179997eae689 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml @@ -14,9 +14,30 @@ limitations under the License. --> + + + + + + - + + + + android:layout_alignParentStart="true" + android:layout_alignTop="@+id/control" + android:layout_marginLeft="300dp" + android:layout_marginStart="300dp" + android:background="#bb7700"> - + android:layout_alignParentLeft="true" + android:layout_alignParentStart="true" /> - + + & input_dims, output_activation_max, output_data, output_dims, gemm_context); } +// Internal function doing the actual arithmetic work for +// ExperimentalShuffledFullyConnected. +// May be called either directly by it (single-threaded case) or may be used +// as the 'task' for worker threads to run (multi-threaded case, see +// ExperimentalShuffledFullyConnectedWorkerTask below). +inline void ExperimentalShuffledFullyConnectedWorkerImpl( + const uint8* shuffled_input_workspace_data, + const int8* shuffled_weights_data, int batches, int output_depth, + int output_stride, int accum_depth, const int32* bias_data, + int32 output_multiplier, int output_shift, int16* output_data) { +#if defined USE_NEON + const int8* shuffled_weights_ptr = shuffled_weights_data; + if (batches == 1) { + const int right_shift = output_shift > 0 ? output_shift : 0; + const int left_shift = output_shift > 0 ? 0 : -output_shift; + for (int c = 0; c < output_depth; c += 4) { + // Accumulation loop. + int32x4_t row_accum0 = vdupq_n_s32(0); + int32x4_t row_accum1 = vdupq_n_s32(0); + int32x4_t row_accum2 = vdupq_n_s32(0); + int32x4_t row_accum3 = vdupq_n_s32(0); + for (int d = 0; d < accum_depth; d += 16) { + int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); + int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); + int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); + int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); + shuffled_weights_ptr += 64; + int8x16_t input = + vreinterpretq_s8_u8(vld1q_u8(shuffled_input_workspace_data + d)); + int16x8_t local_accum0 = + vmull_s8(vget_low_s8(weights0), vget_low_s8(input)); + int16x8_t local_accum1 = + vmull_s8(vget_low_s8(weights1), vget_low_s8(input)); + int16x8_t local_accum2 = + vmull_s8(vget_low_s8(weights2), vget_low_s8(input)); + int16x8_t local_accum3 = + vmull_s8(vget_low_s8(weights3), vget_low_s8(input)); + local_accum0 = + vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input)); + local_accum1 = + vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input)); + local_accum2 = + vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input)); + local_accum3 = + vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input)); + row_accum0 = vpadalq_s16(row_accum0, local_accum0); + row_accum1 = vpadalq_s16(row_accum1, local_accum1); + row_accum2 = vpadalq_s16(row_accum2, local_accum2); + row_accum3 = vpadalq_s16(row_accum3, local_accum3); + } + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, + pairwise_reduced_acc_2, pairwise_reduced_acc_3; + pairwise_reduced_acc_0 = + vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0)); + pairwise_reduced_acc_1 = + vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1)); + pairwise_reduced_acc_2 = + vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2)); + pairwise_reduced_acc_3 = + vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3)); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_data + c); + reduced = vaddq_s32(reduced, bias_vec); + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, right_shift); + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(reduced); + vst1_s16(output_data + c, res16); + } + } else if (batches == 4) { + const int right_shift = output_shift > 0 ? output_shift : 0; + const int left_shift = output_shift > 0 ? 0 : -output_shift; + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = + reinterpret_cast(shuffled_input_workspace_data); + // Accumulation loop. + int32x4_t row_accum00 = vdupq_n_s32(0); + int32x4_t row_accum10 = vdupq_n_s32(0); + int32x4_t row_accum20 = vdupq_n_s32(0); + int32x4_t row_accum30 = vdupq_n_s32(0); + int32x4_t row_accum01 = vdupq_n_s32(0); + int32x4_t row_accum11 = vdupq_n_s32(0); + int32x4_t row_accum21 = vdupq_n_s32(0); + int32x4_t row_accum31 = vdupq_n_s32(0); + int32x4_t row_accum02 = vdupq_n_s32(0); + int32x4_t row_accum12 = vdupq_n_s32(0); + int32x4_t row_accum22 = vdupq_n_s32(0); + int32x4_t row_accum32 = vdupq_n_s32(0); + int32x4_t row_accum03 = vdupq_n_s32(0); + int32x4_t row_accum13 = vdupq_n_s32(0); + int32x4_t row_accum23 = vdupq_n_s32(0); + int32x4_t row_accum33 = vdupq_n_s32(0); + for (int d = 0; d < accum_depth; d += 16) { + int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); + int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); + int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); + int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); + shuffled_weights_ptr += 64; + int8x16_t input0 = vld1q_s8(shuffled_input_ptr + 0); + int8x16_t input1 = vld1q_s8(shuffled_input_ptr + 16); + int8x16_t input2 = vld1q_s8(shuffled_input_ptr + 32); + int8x16_t input3 = vld1q_s8(shuffled_input_ptr + 48); + shuffled_input_ptr += 64; + int16x8_t local_accum0, local_accum1, local_accum2, local_accum3; +#define TFLITE_SHUFFLED_FC_ACCUM(B) \ + local_accum0 = vmull_s8(vget_low_s8(weights0), vget_low_s8(input##B)); \ + local_accum1 = vmull_s8(vget_low_s8(weights1), vget_low_s8(input##B)); \ + local_accum2 = vmull_s8(vget_low_s8(weights2), vget_low_s8(input##B)); \ + local_accum3 = vmull_s8(vget_low_s8(weights3), vget_low_s8(input##B)); \ + local_accum0 = \ + vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input##B)); \ + local_accum1 = \ + vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input##B)); \ + local_accum2 = \ + vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input##B)); \ + local_accum3 = \ + vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input##B)); \ + row_accum0##B = vpadalq_s16(row_accum0##B, local_accum0); \ + row_accum1##B = vpadalq_s16(row_accum1##B, local_accum1); \ + row_accum2##B = vpadalq_s16(row_accum2##B, local_accum2); \ + row_accum3##B = vpadalq_s16(row_accum3##B, local_accum3); + + TFLITE_SHUFFLED_FC_ACCUM(0) + TFLITE_SHUFFLED_FC_ACCUM(1) + TFLITE_SHUFFLED_FC_ACCUM(2) + TFLITE_SHUFFLED_FC_ACCUM(3) + +#undef TFLITE_SHUFFLED_FC_ACCUM + } + // Horizontally reduce accumulators + +#define TFLITE_SHUFFLED_FC_STORE(B) \ + { \ + int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, \ + pairwise_reduced_acc_2, pairwise_reduced_acc_3; \ + pairwise_reduced_acc_0 = \ + vpadd_s32(vget_low_s32(row_accum0##B), vget_high_s32(row_accum0##B)); \ + pairwise_reduced_acc_1 = \ + vpadd_s32(vget_low_s32(row_accum1##B), vget_high_s32(row_accum1##B)); \ + pairwise_reduced_acc_2 = \ + vpadd_s32(vget_low_s32(row_accum2##B), vget_high_s32(row_accum2##B)); \ + pairwise_reduced_acc_3 = \ + vpadd_s32(vget_low_s32(row_accum3##B), vget_high_s32(row_accum3##B)); \ + const int32x2_t reduced_lo = \ + vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); \ + const int32x2_t reduced_hi = \ + vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); \ + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); \ + int32x4_t bias_vec = vld1q_s32(bias_data + c); \ + reduced = vaddq_s32(reduced, bias_vec); \ + reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); \ + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); \ + using gemmlowp::RoundingDivideByPOT; \ + reduced = RoundingDivideByPOT(reduced, right_shift); \ + const int16x4_t res16 = vqmovn_s32(reduced); \ + vst1_s16(output_data + c + B * output_stride, res16); \ + } + + TFLITE_SHUFFLED_FC_STORE(0); + TFLITE_SHUFFLED_FC_STORE(1); + TFLITE_SHUFFLED_FC_STORE(2); + TFLITE_SHUFFLED_FC_STORE(3); + +#undef TFLITE_SHUFFLED_FC_STORE + } + } else { + TFLITE_DCHECK(false); + return; + } +#else + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4] = {0}; + // Accumulation loop. + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_data[d + j]; + int8 weights_val = *shuffled_weights_ptr++; + accum[i] += weights_val * input_val; + } + } + } + for (int i = 0; i < 4; i++) { + // Add bias value + int acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); + output_ptr[c + i] = acc; + } + } + } else if (batches == 4) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = shuffled_input_data; + // Accumulation loop. + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4][4]; + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + accum[i][b] = 0; + } + } + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_ptr[16 * b + j]; + int8 weights_val = shuffled_weights_ptr[16 * i + j]; + accum[i][b] += weights_val * input_val; + } + } + } + shuffled_input_ptr += 64; + shuffled_weights_ptr += 64; + } + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + // Add bias value + int acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The + // quantized multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, -32768); + acc = std::min(acc, 32767); + output_ptr[b * output_stride + c + i] = acc; + } + } + } + } else { + TFLITE_DCHECK(false); + return; + } +#endif +} + +// Wraps ExperimentalShuffledFullyConnectedWorkerImpl into a Task class +// to allow using gemmlowp's threadpool. +struct ExperimentalShuffledFullyConnectedWorkerTask : gemmlowp::Task { + ExperimentalShuffledFullyConnectedWorkerTask( + const uint8* input_data, const int8* shuffled_weights_data, int batches, + int output_depth, int output_stride, int accum_depth, + const int32* bias_data, int32 output_multiplier, int output_shift, + int16* output_data) + : input_data_(input_data), + shuffled_weights_data_(shuffled_weights_data), + batches_(batches), + output_depth_(output_depth), + output_stride_(output_stride), + accum_depth_(accum_depth), + bias_data_(bias_data), + output_multiplier_(output_multiplier), + output_shift_(output_shift), + output_data_(output_data) {} + + void Run() override { + ExperimentalShuffledFullyConnectedWorkerImpl( + input_data_, shuffled_weights_data_, batches_, output_depth_, + output_stride_, accum_depth_, bias_data_, output_multiplier_, + output_shift_, output_data_); + } + + const uint8* input_data_; + const int8* shuffled_weights_data_; + int batches_; + int output_depth_; + int output_stride_; + int accum_depth_; + const int32* bias_data_; + int32 output_multiplier_; + int output_shift_; + int16* output_data_; +}; + inline void ExperimentalShuffledFullyConnected( const uint8* input_data, const Dims<4>& input_dims, const uint8* shuffled_weights_data, const Dims<4>& weights_dims, const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { gemmlowp::ScopedProfilingLabel label( "ExperimentalShuffledFullyConnected/8bit"); (void)gemm_context; // only used in optimized code. @@ -1226,117 +1543,100 @@ inline void ExperimentalShuffledFullyConnected( const int accum_depth = ArraySize(weights_dims, 0); TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) // so that just reinterpreting them as int8 values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. - const int8* shuffled_weights_ptr = + const int8* int8_shuffled_weights_data = reinterpret_cast(shuffled_weights_data); -#if defined USE_NEON - // We'll only need to xor signbit to the input activation values, as - // that xor-ing is pre-built into the shuffled weights values. - const uint8x16_t signbit = vdupq_n_u8(0x80); - const int right_shift = output_shift > 0 ? output_shift : 0; - const int left_shift = output_shift > 0 ? 0 : -output_shift; - for (int c = 0; c < output_depth; c += 4) { - // Accumulation loop. - int32x4_t row_accum0 = vdupq_n_s32(0); - int32x4_t row_accum1 = vdupq_n_s32(0); - int32x4_t row_accum2 = vdupq_n_s32(0); - int32x4_t row_accum3 = vdupq_n_s32(0); - for (int d = 0; d < accum_depth; d += 16) { - int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0); - int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16); - int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32); - int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48); - shuffled_weights_ptr += 64; - int8x16_t input = - vreinterpretq_s8_u8(veorq_u8(signbit, vld1q_u8(input_data + d))); - int16x8_t local_accum0 = - vmull_s8(vget_low_s8(weights0), vget_low_s8(input)); - int16x8_t local_accum1 = - vmull_s8(vget_low_s8(weights1), vget_low_s8(input)); - int16x8_t local_accum2 = - vmull_s8(vget_low_s8(weights2), vget_low_s8(input)); - int16x8_t local_accum3 = - vmull_s8(vget_low_s8(weights3), vget_low_s8(input)); - local_accum0 = - vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input)); - local_accum1 = - vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input)); - local_accum2 = - vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input)); - local_accum3 = - vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input)); - row_accum0 = vpadalq_s16(row_accum0, local_accum0); - row_accum1 = vpadalq_s16(row_accum1, local_accum1); - row_accum2 = vpadalq_s16(row_accum2, local_accum2); - row_accum3 = vpadalq_s16(row_accum3, local_accum3); + + // Shuffling and xoring of input activations into the workspace buffer + if (batches == 1) { +#ifdef USE_NEON + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (int i = 0; i < accum_depth; i += 16) { + uint8x16_t val = vld1q_u8(input_data + i); + val = veorq_u8(val, signbit); + vst1q_u8(shuffled_input_workspace_data + i, val); } - // Horizontally reduce accumulators - int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, - pairwise_reduced_acc_2, pairwise_reduced_acc_3; - pairwise_reduced_acc_0 = - vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0)); - pairwise_reduced_acc_1 = - vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1)); - pairwise_reduced_acc_2 = - vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2)); - pairwise_reduced_acc_3 = - vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3)); - const int32x2_t reduced_lo = - vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); - const int32x2_t reduced_hi = - vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); - int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); - // Add bias values. - int32x4_t bias_vec = vld1q_s32(bias_data + c); - reduced = vaddq_s32(reduced, bias_vec); - reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); - // Multiply by the fixed-point multiplier. - reduced = vqrdmulhq_n_s32(reduced, output_multiplier); - // Rounding-shift-right. - using gemmlowp::RoundingDivideByPOT; - reduced = RoundingDivideByPOT(reduced, right_shift); - // Narrow values down to 16 bit signed. - const int16x4_t res16 = vqmovn_s32(reduced); - vst1_s16(output_data + c, res16); - } #else - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. - for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } +#endif + } else if (batches == 4) { + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + int c = 0; +#ifdef USE_NEON + const uint8x16_t signbit = vdupq_n_u8(0x80); + for (c = 0; c < accum_depth; c += 16) { + const uint8* src_data_ptr = input_data + c; + uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth); + uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth); + uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth); + uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth); + val0 = veorq_u8(val0, signbit); + val1 = veorq_u8(val1, signbit); + val2 = veorq_u8(val2, signbit); + val3 = veorq_u8(val3, signbit); + vst1q_u8(shuffled_input_workspace_ptr + 0, val0); + vst1q_u8(shuffled_input_workspace_ptr + 16, val1); + vst1q_u8(shuffled_input_workspace_ptr + 32, val2); + vst1q_u8(shuffled_input_workspace_ptr + 48, val3); + shuffled_input_workspace_ptr += 64; + } +#else + for (c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - int8 input_val = input_data[d + j] - 128; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; } } } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = - MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[c + i] = acc; - } - } #endif + } else { + TFLITE_DCHECK(false); + return; + } + + static constexpr int kKernelRows = 4; + const int thread_count = gemmlowp::HowManyThreads( + gemm_context->max_num_threads(), output_depth, batches, accum_depth); + if (thread_count == 1) { + // Single-thread case: do the computation on the current thread, don't + // use a threadpool + ExperimentalShuffledFullyConnectedWorkerImpl( + shuffled_input_workspace_data, int8_shuffled_weights_data, batches, + output_depth, output_depth, accum_depth, bias_data, output_multiplier, + output_shift, output_data); + return; + } + + // Multi-threaded case: use the gemmlowp context's threadpool. + TFLITE_DCHECK_GT(thread_count, 1); + std::vector tasks(thread_count); + const int kRowsPerWorker = + gemmlowp::RoundUp(output_depth / thread_count); + int row_start = 0; + for (int i = 0; i < thread_count; i++) { + int row_end = std::min(output_depth, row_start + kRowsPerWorker); + tasks[i] = new ExperimentalShuffledFullyConnectedWorkerTask( + shuffled_input_workspace_data, + int8_shuffled_weights_data + row_start * accum_depth, batches, + row_end - row_start, output_depth, accum_depth, bias_data + row_start, + output_multiplier, output_shift, output_data + row_start); + row_start = row_end; + } + TFLITE_DCHECK_EQ(row_start, output_depth); + gemm_context->workers_pool()->Execute(tasks); } template @@ -5474,6 +5774,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims, const std::vector& right_paddings, T* output_data, const Dims<4>& output_dims, const int32_t pad_value) { gemmlowp::ScopedProfilingLabel label("Pad"); + TFLITE_DCHECK_EQ(left_paddings.size(), 4); + TFLITE_DCHECK_EQ(right_paddings.size(), 4); + const int output_batch = ArraySize(output_dims, 3); const int output_height = ArraySize(output_dims, 2); const int output_width = ArraySize(output_dims, 1); diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h index 4e324a5e107cf5a90c0042331899edab831c8e51..ff15f3e3b10324f8fa0ebcd924560c61221e7ee5 100644 --- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h +++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ -#define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ +#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ +#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. @@ -135,4 +135,4 @@ void NeonReductionSumVector(const float* input_vector, float* output_vector, } // namespace tensor_utils } // namespace tflite -#endif // TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ +#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_ diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h index 9ad125b8eba7f6457d582cd3ee512b87234a0e95..4c8cbe42759d0af6bacca718d7f6bcae5efa1169 100644 --- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h @@ -608,8 +608,9 @@ inline void ExperimentalShuffledFullyConnected( const int32* bias_data, const Dims<4>& bias_dims, int32 output_multiplier, int output_shift, int32 output_activation_min, int32 output_activation_max, int16* output_data, const Dims<4>& output_dims, - gemmlowp::GemmContext* gemm_context) { + uint8* shuffled_input_workspace_data, gemmlowp::GemmContext* gemm_context) { (void)gemm_context; // only used in optimized code. + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); // TODO(benoitjacob): This really should be: // const int batches = ArraySize(output_dims, 1); @@ -622,44 +623,130 @@ inline void ExperimentalShuffledFullyConnected( const int accum_depth = ArraySize(weights_dims, 0); TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); - // The experimental shuffling is an optimization for matrix*vector product. - // We aren't interested in supporting non-matrix*vector-product cases, i.e. - // batches>1. - TFLITE_DCHECK_EQ(batches, 1); - // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to - // subtracting 128 from them, thus implementing for free the subtraction of - // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); - for (int c = 0; c < output_depth; c += 4) { - // Internal accumulation. - // Initialize accumulator with the bias-value. - int32 accum[4] = {0}; - // Accumulation loop. - for (int d = 0; d < accum_depth; d += 16) { - for (int i = 0; i < 4; i++) { + TFLITE_DCHECK((accum_depth % 16) == 0); + TFLITE_DCHECK((output_depth % 4) == 0); + + // Shuffling and xoring of input activations into the workspace buffer + uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + if (batches == 1) { + for (int i = 0; i < accum_depth; i++) { + shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; + } + } else if (batches == 4) { + for (int c = 0; c < accum_depth; c += 16) { + for (int b = 0; b < 4; b++) { + const uint8* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - int8 input_val = input_data[d + j] - 128; - int8 weights_val = *shuffled_weights_ptr++; - accum[i] += weights_val * input_val; + uint8 src_val = *src_data_ptr++; + // Flip the sign bit, so that the kernel will only need to + // reinterpret these uint8 values as int8, getting for free the + // subtraction of the zero_point value 128. + uint8 dst_val = src_val ^ 0x80; + *shuffled_input_workspace_ptr++ = dst_val; } } } - for (int i = 0; i < 4; i++) { - // Add bias value - int acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our - // (16-bit, typically 3 integer bits) fixed-point format. The quantized - // multiplier and shift here have been pre-computed offline - // (e.g. by toco). - acc = - MultiplyByQuantizedMultiplier(acc, output_multiplier, -output_shift); - // Saturate, cast to int16, and store to output array. - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_data[c + i] = acc; + } else { + TFLITE_DCHECK(false); + return; + } + + // Actual computation + if (batches == 1) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4] = {0}; + // Accumulation loop. + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_data[d + j]; + int8 weights_val = *shuffled_weights_ptr++; + accum[i] += weights_val * input_val; + } + } + } + for (int i = 0; i < 4; i++) { + // Add bias value + int acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The quantized + // multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[c + i] = acc; + } } + } else if (batches == 4) { + int16* output_ptr = output_data; + // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) + // so that just reinterpreting them as int8 values is equivalent to + // subtracting 128 from them, thus implementing for free the subtraction of + // the zero_point value 128. + const int8* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); + // Likewise, we preshuffled and pre-xored the input data above. + const int8* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); + for (int c = 0; c < output_depth; c += 4) { + const int8* shuffled_input_ptr = shuffled_input_data; + // Accumulation loop. + // Internal accumulation. + // Initialize accumulator with the bias-value. + int32 accum[4][4]; + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + accum[i][b] = 0; + } + } + for (int d = 0; d < accum_depth; d += 16) { + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + for (int j = 0; j < 16; j++) { + int8 input_val = shuffled_input_ptr[16 * b + j]; + int8 weights_val = shuffled_weights_ptr[16 * i + j]; + accum[i][b] += weights_val * input_val; + } + } + } + shuffled_input_ptr += 64; + shuffled_weights_ptr += 64; + } + for (int i = 0; i < 4; i++) { + for (int b = 0; b < 4; b++) { + // Add bias value + int acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32 accumulator to the scale used by our + // (16-bit, typically 3 integer bits) fixed-point format. The + // quantized multiplier and shift here have been pre-computed offline + // (e.g. by toco). + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, + -output_shift); + // Saturate, cast to int16, and store to output array. + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_ptr[b * output_depth + c + i] = acc; + } + } + } + } else { + TFLITE_DCHECK(false); + return; } } @@ -2993,6 +3080,9 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims, const std::vector& left_paddings, const std::vector& right_paddings, T* output_data, const Dims<4>& output_dims, const int32_t pad_value) { + TFLITE_DCHECK_EQ(left_paddings.size(), 4); + TFLITE_DCHECK_EQ(right_paddings.size(), 4); + const int output_batch = ArraySize(output_dims, 3); const int output_height = ArraySize(output_dims, 2); const int output_width = ArraySize(output_dims, 1); diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h index a9064d54e7704d52eefa34f6bf446ec1cfe68fe1..a5f345e98a9d4fda57072684298fbc084b8b9a61 100644 --- a/tensorflow/contrib/lite/kernels/test_util.h +++ b/tensorflow/contrib/lite/kernels/test_util.h @@ -88,7 +88,9 @@ struct TensorData { class SingleOpResolver : public OpResolver { public: SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration) - : op_(op), registration_(registration) {} + : op_(op), registration_(registration) { + registration_->builtin_code = op; + } TfLiteRegistration* FindOp(BuiltinOperator op) const override { if (op == op_) { return registration_; diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc index 2dd6d67e078619df41524e8242a0475320c02013..f45f39d1e6f874ad548c3cd2b64c2c1ee1d2fa0d 100644 --- a/tensorflow/contrib/lite/model.cc +++ b/tensorflow/contrib/lite/model.cc @@ -194,7 +194,6 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() { builtin_code); status = kTfLiteError; } else if (builtin_code != BuiltinOperator_CUSTOM) { - flatbuffer_op_index_to_registration_types_.push_back(builtin_code); registration = op_resolver_.FindOp(builtin_code); if (registration == nullptr) { error_reporter_->Report("Didn't find op for builtin opcode '%s'\n", @@ -208,8 +207,6 @@ TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() { } else { const char* name = opcode->custom_code()->c_str(); registration = op_resolver_.FindOp(name); - flatbuffer_op_index_to_registration_types_.push_back( - BuiltinOperator_CUSTOM); if (registration == nullptr) { error_reporter_->Report("Didn't find custom op for name '%s'\n", name); status = kTfLiteError; @@ -702,8 +699,7 @@ TfLiteStatus InterpreterBuilder::ParseNodes( continue; } - auto op_type = - flatbuffer_op_index_to_registration_types_[op->opcode_index()]; + BuiltinOperator op_type = static_cast(reg->builtin_code); if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) { error_reporter_->Report( "Found builtin operator %s with custom options.\n", diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h index 5a55b031a8c28085e02782608eb820a3cfe78dde..a7d7f3ea1096799608d9fcd324de65638f1a6d6f 100644 --- a/tensorflow/contrib/lite/model.h +++ b/tensorflow/contrib/lite/model.h @@ -188,7 +188,6 @@ class InterpreterBuilder { ErrorReporter* error_reporter_; std::vector flatbuffer_op_index_to_registration_; - std::vector flatbuffer_op_index_to_registration_types_; const Allocation* allocation_ = nullptr; }; diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc index e1366639c78a4e90740aaf42a9ba5770ec65cb78..dfdd80ea8a42af683632be1d7e8ab0062847077d 100644 --- a/tensorflow/contrib/lite/optional_debug_tools.cc +++ b/tensorflow/contrib/lite/optional_debug_tools.cc @@ -72,7 +72,7 @@ const char* AllocTypeName(TfLiteAllocationType type) { // Prints a dump of what tensors and what nodes are in the interpreter. void PrintInterpreterState(Interpreter* interpreter) { - printf("Interpreter has %d tensors and %d nodes\n", + printf("Interpreter has %zu tensors and %zu nodes\n", interpreter->tensors_size(), interpreter->nodes_size()); printf("Inputs:"); PrintIntVector(interpreter->inputs()); diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h index 3bfe02571ba59f05ff316d327d2a964f6d5b4e1e..b2f565376c3d1c9517e43167aacdd6f4d873ca1c 100644 --- a/tensorflow/contrib/lite/profiling/profile_buffer.h +++ b/tensorflow/contrib/lite/profiling/profile_buffer.h @@ -37,9 +37,9 @@ struct ProfileEvent { // Label of the event. This usually describes the event. const char* tag; // Timestamp in microseconds when the event began. - int64_t begin_timestamp_ms; + int64_t begin_timestamp_us; // Timestamp in microseconds when the event ended. - int64_t end_timestamp_ms; + int64_t end_timestamp_us; // The field containing the type of event. This must be one of the event types // in EventType. EventType event_type; @@ -79,8 +79,8 @@ class ProfileBuffer { event_buffer_[index].tag = tag; event_buffer_[index].event_type = event_type; event_buffer_[index].event_metadata = event_metadata; - event_buffer_[index].begin_timestamp_ms = timestamp; - event_buffer_[index].end_timestamp_ms = 0; + event_buffer_[index].begin_timestamp_us = timestamp; + event_buffer_[index].end_timestamp_us = 0; current_index_++; return index; } @@ -103,7 +103,7 @@ class ProfileBuffer { } int event_index = event_handle % max_size; - event_buffer_[event_index].end_timestamp_ms = NowMicros(); + event_buffer_[event_index].end_timestamp_us = NowMicros(); } // Returns the size of the buffer. diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc index 0c5f0cd31495d67beb867beb835cfc897d7d6265..b8784cca455cfc301f2cc30c9c6d031b7174f829 100644 --- a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc +++ b/tensorflow/contrib/lite/profiling/profile_buffer_test.cc @@ -49,13 +49,13 @@ TEST(ProfileBufferTest, AddEvent) { auto event = GetProfileEvents(buffer)[0]; EXPECT_EQ(event->tag, "hello"); - EXPECT_GT(event->begin_timestamp_ms, 0); + EXPECT_GT(event->begin_timestamp_us, 0); EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT); EXPECT_EQ(event->event_metadata, 42); buffer.EndEvent(event_handle); EXPECT_EQ(1, buffer.Size()); - EXPECT_GE(event->end_timestamp_ms, event->begin_timestamp_ms); + EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us); } TEST(ProfileBufferTest, OverFlow) { diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/contrib/lite/profiling/profiler_test.cc index 994523a8fb74b511a18f2ec53147ce81323bec1b..7914f36a319b855090ca43556967fa9abbc4e1b3 100644 --- a/tensorflow/contrib/lite/profiling/profiler_test.cc +++ b/tensorflow/contrib/lite/profiling/profiler_test.cc @@ -30,7 +30,7 @@ namespace { void AssertDurationOfEventAroundMs(const ProfileEvent* event, double expected_ms, double eps_ms) { double duration_ms = - (event->end_timestamp_ms - event->begin_timestamp_ms) / 1e3; + (event->end_timestamp_us - event->begin_timestamp_us) / 1e3; EXPECT_NEAR(expected_ms, duration_ms, eps_ms); } diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD index 926896d609d83aac3b875d33dfe3c4dc7ae89ccd..e6dcc7aa099ccde848ef6cd9322639ea2d0f1c01 100644 --- a/tensorflow/contrib/lite/python/BUILD +++ b/tensorflow/contrib/lite/python/BUILD @@ -39,16 +39,35 @@ py_test( py_library( name = "lite", srcs = ["lite.py"], - # data = [ - # "//tensorflow/contrib/lite/toco/python:toco_from_protos", - # ], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ + ":convert", + ":convert_saved_model", ":op_hint", + ], +) + +py_library( + name = "lite_constants", + srcs = ["lite_constants.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/contrib/lite/toco:toco_flags_proto_py", + ], +) + +py_library( + name = "convert", + srcs = ["convert.py"], + srcs_version = "PY2AND3", + visibility = ["//visibility:public"], + deps = [ + ":lite_constants", "//tensorflow/contrib/lite/toco:model_flags_proto_py", "//tensorflow/contrib/lite/toco:toco_flags_proto_py", "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco", + "//tensorflow/contrib/lite/toco/python:toco_from_protos", "//tensorflow/python:platform", ], ) @@ -66,15 +85,15 @@ py_library( ) py_test( - name = "lite_test", - srcs = ["lite_test.py"], + name = "convert_test", + srcs = ["convert_test.py"], srcs_version = "PY2AND3", tags = [ "no-internal-py3", "no_oss", ], deps = [ - ":lite", + ":convert", ":op_hint", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -84,13 +103,14 @@ py_test( ], ) -py_binary( +py_library( name = "convert_saved_model", srcs = ["convert_saved_model.py"], srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = [ - ":lite", + ":convert", + ":lite_constants", "//tensorflow/contrib/saved_model:saved_model_py", "//tensorflow/python:graph_util", "//tensorflow/python/tools:freeze_graph_lib", @@ -130,6 +150,15 @@ py_test( ], ) +py_binary( + name = "convert_saved_model_to_frozen_graph", + srcs = ["convert_saved_model_to_frozen_graph.py"], + srcs_version = "PY2AND3", + deps = [ + ":convert_saved_model", + ], +) + # Transitive dependencies of this target will be included in the pip package. py_library( name = "tf_lite_py_pip", diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..c4200c879ba0e17b3bd183f4004eb75ebdd2f5ee --- /dev/null +++ b/tensorflow/contrib/lite/python/convert.py @@ -0,0 +1,187 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Converts a frozen graph into a TFLite FlatBuffer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os as _os +import subprocess as _subprocess +import tempfile as _tempfile + +from tensorflow.contrib.lite.python import lite_constants +from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 +from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 +from tensorflow.python.framework import dtypes as _dtypes +from tensorflow.python.platform import resource_loader as _resource_loader +from tensorflow.python.util.lazy_loader import LazyLoader + + +# Lazy load since some of the performance benchmark skylark rules +# break dependencies. +_toco_python = LazyLoader( + "tensorflow_wrap_toco", globals(), + "tensorflow.contrib.lite.toco.python." + "tensorflow_wrap_toco") +del LazyLoader + +# Find the toco_from_protos binary using the resource loader if using from +# bazel, otherwise we are in a pip where console_scripts already has +# the toco_from_protos tool. +if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY: + _toco_from_proto_bin = "" +else: + _toco_from_proto_bin = _resource_loader.get_path_to_datafile( + "../toco/python/toco_from_protos") + +if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin): + _toco_from_proto_bin = "toco_from_protos" + + +def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str): + """Convert `input_data_str` according to model and toco parameters. + + Unless you know what you are doing consider using + the more friendly @{tf.contrib.lite.toco_convert}}. + + Args: + model_flags_str: Serialized proto describing model properties, see + `toco/model_flags.proto`. + toco_flags_str: Serialized proto describing conversion properties, see + `toco/toco_flags.proto`. + input_data_str: Input data in serialized form (e.g. a graphdef is common) + Returns: + Converted model in serialized form (e.g. a TFLITE model is common). + Raises: + RuntimeError: When conversion fails, an exception is raised with the error + message embedded. + """ + # TODO(aselle): When toco does not use fatal errors for failure, we can + # switch this on. + if not _toco_from_proto_bin: + return _toco_python.TocoConvert( + model_flags_str, toco_flags_str, input_data_str) + + with _tempfile.NamedTemporaryFile() as fp_toco, \ + _tempfile.NamedTemporaryFile() as fp_model, \ + _tempfile.NamedTemporaryFile() as fp_input, \ + _tempfile.NamedTemporaryFile() as fp_output: + fp_model.write(model_flags_str) + fp_toco.write(toco_flags_str) + fp_input.write(input_data_str) + fp_model.flush() + fp_toco.flush() + fp_input.flush() + + cmd = [ + _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name, + fp_output.name + ] + cmdline = " ".join(cmd) + proc = _subprocess.Popen( + cmdline, + shell=True, + stdout=_subprocess.PIPE, + stderr=_subprocess.STDOUT, + close_fds=True) + stdout, stderr = proc.communicate() + exitcode = proc.returncode + if exitcode == 0: + stuff = fp_output.read() + return stuff + else: + raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" % + (stdout, stderr)) + + +def tensor_name(x): + return x.name.split(":")[0] + + +def toco_convert(input_data, + input_tensors, + output_tensors, + inference_type=lite_constants.FLOAT, + input_format=lite_constants.TENSORFLOW_GRAPHDEF, + output_format=lite_constants.TFLITE, + quantized_input_stats=None, + drop_control_dependency=True): + """Convert a model using TOCO from `input_format` to `output_format`. + + Typically this is to convert from TensorFlow GraphDef to TFLite, in which + case the default `input_format` and `output_format` are sufficient. + + Args: + input_data: Input data (i.e. often `sess.graph_def`). + input_tensors: List of input tensors. Type and shape are computed using + `foo.get_shape()` and `foo.dtype`. + output_tensors: List of output tensors (only .name is used from this). + inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`. + input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF). + output_format: Type of data to write (currently must be TFLITE or + GRAPHVIZ_DOT) + quantized_input_stats: For each member of input_tensors the mean and + std deviation of training data. Only needed if `inference_type` is + `QUANTIZED_UINT8`. + drop_control_dependency: Drops control dependencies silently. This is due + to tf lite not supporting control dependencies. + + Returns: + The converted data. For example if tflite was the destination, then + this will be a tflite flatbuffer in a bytes array. + + Raises: + ValueError: If the input tensor type is unknown + RuntimeError: If TOCO fails to convert (in which case the runtime error's + error text will contain the TOCO error log) + """ + toco = _toco_flags_pb2.TocoFlags() + toco.input_format = input_format + toco.output_format = output_format + toco.drop_control_dependency = drop_control_dependency + model = _model_flags_pb2.ModelFlags() + toco.inference_type = inference_type + for idx, input_tensor in enumerate(input_tensors): + if input_tensor.dtype == _dtypes.float32: + tflite_input_type = lite_constants.FLOAT + elif input_tensor.dtype == _dtypes.int32: + tflite_input_type = lite_constants.INT32 + elif input_tensor.dtype == _dtypes.int64: + tflite_input_type = lite_constants.INT64 + # TODO(aselle): Insert strings when they are available + else: + raise ValueError("Tensors %s not known type %r" % (input_tensor.name, + input_tensor.dtype)) + + input_array = model.input_arrays.add() + + if inference_type == lite_constants.QUANTIZED_UINT8: + if tflite_input_type == lite_constants.FLOAT: + tflite_input_type = lite_constants.QUANTIZED_UINT8 + input_array.mean_value, input_array.std_value = quantized_input_stats[idx] + + input_array.name = tensor_name(input_tensor) + input_array.shape.dims.extend(map(int, input_tensor.get_shape())) + + for output_tensor in output_tensors: + model.output_arrays.append(tensor_name(output_tensor)) + + # TODO(aselle): Consider handling the case of allowing quantized + # inputs to be converted to float (via the toco.inference_input_type field). + data = toco_convert_protos(model.SerializeToString(), + toco.SerializeToString(), + input_data.SerializeToString()) + return data diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py index a2b5ef488ec1feb455b2c8d5d1c4005c3b2f60d6..a7eddf3408f54dff5fa49ff6fa7b61cd0b8a22e4 100644 --- a/tensorflow/contrib/lite/python/convert_saved_model.py +++ b/tensorflow/contrib/lite/python/convert_saved_model.py @@ -12,52 +12,43 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -r"""TensorFlow Lite flatbuffer generation from saved_models. +"""Functions to convert SavedModel to frozen GraphDefs.""" -Example: - -bazel run third_party/tensorflow/contrib/lite/python:convert_saved_model -- \ - --saved_model_dir=/tmp/test_saved_model/1519865537 \ - --output_tflite=/tmp/test.lite - -""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.lite.python import lite +from tensorflow.contrib.lite.python import convert +from tensorflow.contrib.lite.python import lite_constants +from tensorflow.contrib.lite.toco import model_flags_pb2 from tensorflow.contrib.saved_model.python.saved_model import reader from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils from tensorflow.core.framework import types_pb2 from tensorflow.python.client import session from tensorflow.python.framework import graph_util as tf_graph_util from tensorflow.python.framework import ops -from tensorflow.python.platform import app -from tensorflow.python.platform import flags from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import loader from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants -flags.DEFINE_string("saved_model_dir", "", "Saved model directory to convert.") -flags.DEFINE_string("output_tflite", None, "File path to write flatbuffer.") -flags.DEFINE_string("output_arrays", None, - "List of output tensor names, the default value is None, " - "which means the conversion will keep all outputs.") -flags.DEFINE_integer("batch_size", 1, - "If input tensor shape has None at first dimension, " - "e.g. (None,224,224,3), replace None with batch_size.") -flags.DEFINE_string("tag_set", tag_constants.SERVING, - "Group of tag(s) of the MetaGraphDef in the saved_model, " - "in string format, separated by ','. For tag-set contains " - "multiple tags, all tags must be passed in.") -flags.DEFINE_string("signature_key", - signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, - "This is signature key to extract inputs, outputs.") - - -def log_tensor_details(tensor_info): + +def _write_and_flush_file(file_path, data_str): + """Writes data to file path. + + Args: + file_path: Full path of the file to store data in. + data_str: Data represented as a string. + + Returns: None. + """ + with gfile.Open(file_path, "wb") as data_file: + data_file.write(data_str) + data_file.flush() + + +def _log_tensor_details(tensor_info): """Log tensor details: name, shape, and type.""" for key in tensor_info: val = tensor_info[key] @@ -73,7 +64,7 @@ def log_tensor_details(tensor_info): dtype) -def get_meta_graph_def(saved_model_dir, tag_set): +def _get_meta_graph_def(saved_model_dir, tag_set): """Validate saved_model and extract MetaGraphDef. Args: @@ -103,7 +94,7 @@ def get_meta_graph_def(saved_model_dir, tag_set): "values are '{}'. ".format(tag_set, tag_sets)) -def get_signature_def(meta_graph, signature_key): +def _get_signature_def(meta_graph, signature_key): """Get the signature def from meta_graph with given signature_key. Args: @@ -130,11 +121,11 @@ def get_signature_def(meta_graph, signature_key): return signature_def -def get_inputs_outputs(signature_def): - """Get inputs and outputs from signature def. +def _get_inputs_outputs(signature_def): + """Get inputs and outputs from SignatureDef. Args: - signature_def: signatuer def in the meta_graph_def for conversion. + signature_def: SignatureDef in the meta_graph_def for conversion. Returns: The inputs and outputs in the graph for conversion. @@ -142,9 +133,9 @@ def get_inputs_outputs(signature_def): inputs_tensor_info = signature_def.inputs outputs_tensor_info = signature_def.outputs logging.info("input tensors info: ") - log_tensor_details(inputs_tensor_info) + _log_tensor_details(inputs_tensor_info) logging.info("output tensors info: ") - log_tensor_details(outputs_tensor_info) + _log_tensor_details(outputs_tensor_info) def gather_names(tensor_info): return [tensor_info[key].name for key in tensor_info] @@ -154,109 +145,277 @@ def get_inputs_outputs(signature_def): return inputs, outputs -def convert(saved_model_dir, - output_tflite=None, - output_arrays=None, - tag_set=None, - signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, - batch_size=1): - """Convert a saved_model to tflite flatbuffer. +def _get_tensors(graph, signature_def_tensor_names=None, + user_tensor_names=None): + """Gets the tensors associated with the tensor names. + + Either signature_def_tensor_names or user_tensor_names should be provided. If + the user provides tensors, the tensors associated with the user provided + tensor names are provided. Otherwise, the tensors associated with the names in + the SignatureDef are provided. Args: - saved_model_dir: Saved model directory to convert. - output_tflite: File path to write result flatbuffer. - output_arrays: List of output tensor names, the default value is None, which - means conversion keeps all output tensors. This is also used to filter - tensors that are from Op currently not supported in tflite, e.g., Argmax). - tag_set: This is the set of tags to get meta_graph_def in saved_model. - signature_key: This is the signature key to extract inputs, outputs. - batch_size: If input tensor shape has None at first dimension, - e.g. (None,224,224,3), replace None with batch_size. + graph: GraphDef representing graph. + signature_def_tensor_names: Tensor names stored in either the inputs or + outputs of a SignatureDef. (default None) + user_tensor_names: Tensor names provided by the user. (default None) Returns: - The converted data. For example if tflite was the destination, then - this will be a tflite flatbuffer in a bytes array. + List of tensors. + + Raises: + ValueError: + signature_def_tensors and user_tensor_names are undefined or empty. + user_tensor_names are not valid. + """ + tensors = [] + if user_tensor_names: + # Get the list of all of the tensors with and without the tensor index. + all_tensor_names = [ + tensor.name for op in graph.get_operations() for tensor in op.outputs + ] + all_tensor_names_only = [name.split(":")[0] for name in all_tensor_names] + + # Sort the tensor names. + user_tensor_names = sorted(user_tensor_names) + + # Get the tensors associated with the tensor names. + tensors = [] + invalid_tensors = [] + for name in user_tensor_names: + if name not in all_tensor_names_only: + invalid_tensors.append(name) + else: + idx = all_tensor_names_only.index(name) + tensors.append(graph.get_tensor_by_name(all_tensor_names[idx])) + + # Throw ValueError if any user input names are not valid tensors. + if invalid_tensors: + raise ValueError("Invalid tensors '{}' were found.".format( + ",".join(invalid_tensors))) + elif signature_def_tensor_names: + tensors = [ + graph.get_tensor_by_name(name) + for name in sorted(signature_def_tensor_names) + ] + else: + # Throw ValueError if signature_def_tensors and user_tensor_names are both + # either undefined or empty. + raise ValueError( + "Specify either signature_def_tensor_names or user_tensor_names") + + return tensors + + +def _freeze_saved_model(saved_model_dir, input_arrays, input_shapes, + output_arrays, tag_set, signature_key, batch_size): + """Converts a SavedModel to a frozen graph. + + Args: + saved_model_dir: SavedModel directory to convert. + input_arrays: List of input tensors to freeze graph with. Uses input arrays + from SignatureDef when none are provided. (default None) + input_shapes: Map of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : None}). + (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to + analyze. All tags in the tag set must be present. (default "serve") + signature_key: Key identifying SignatureDef containing inputs and outputs. + batch_size: Batch size for the model. Replaces the first dimension of an + input size array if undefined. (default 1) + + Returns: + frozen_graph_def: Frozen GraphDef. + in_tensors: List of input tensors for the graph. + out_tensors: List of output tensors for the graph. Raises: - ValueError: If tag_set does not indicate any meta_graph_def in saved_model, - or signature_key is not in relevant meta_graph_def, - or input shape has None beyond 1st dimension, e.g., (1,None, None, 3), - or given output_arrays are not valid causing empty outputs. + ValueError: + SavedModel doesn't contain a MetaGraphDef identified by tag_set. + signature_key is not in the MetaGraphDef. + input_shapes does not match the length of input_arrays. + input_shapes has a None value after the 1st dimension. + input_arrays or output_arrays are not valid. + Unable to load Session. """ + # Set default values for inputs if they are set to None. + if signature_key is None: + signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY if tag_set is None: tag_set = set([tag_constants.SERVING]) + if batch_size is None: + batch_size = 1 - meta_graph = get_meta_graph_def(saved_model_dir, tag_set) - signature_def = get_signature_def(meta_graph, signature_key) - inputs, outputs = get_inputs_outputs(signature_def) + # Read SignatureDef. + meta_graph = _get_meta_graph_def(saved_model_dir, tag_set) + signature_def = _get_signature_def(meta_graph, signature_key) + inputs, outputs = _get_inputs_outputs(signature_def) graph = ops.Graph() with session.Session(graph=graph) as sess: - + # TODO(nupurgarg): Throw ValueError if SavedModel has assets/ directory. loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir) - in_tensors = [graph.get_tensor_by_name(input_) for input_ in inputs] - - # Users can use output_arrays to filter output tensors for conversion. - # If output_arrays is None, we keep all output tensors. In future, we may - # use tflite supported Op list and check whether op is custom Op to - # automatically filter output arrays. - # TODO(zhixianyan): Use tflite supported Op list to filter outputs. - if output_arrays is not None: - output_arrays = output_arrays.split(",") - out_tensors = [ - graph.get_tensor_by_name(output) - for output in outputs - if output.split(":")[0] in output_arrays - ] - else: - out_tensors = [graph.get_tensor_by_name(output) for output in outputs] + # Gets input and output tensors. + # TODO(zhixianyan): Use TFLite supported Op list to filter outputs. + in_tensors = _get_tensors(graph, inputs, input_arrays) + out_tensors = _get_tensors(graph, outputs, output_arrays) - output_names = [node.split(":")[0] for node in outputs] + # Gets fully defined tensor shape. An input tensor with None in the first + # dimension, e.g. (None, 224, 224, 3), is replaced with the batch_size. + # Shapes with None after the first dimension result in a ValueError. + # TODO(zhixianyan): Add supports for input tensor with more None in shape. + for tensor in in_tensors: + if (input_shapes and tensor.name in input_shapes and + input_shapes[tensor.name] is not None): + shape = input_shapes[tensor.name] + else: + shape = tensor.get_shape().as_list() - if not out_tensors: - raise ValueError( - "No valid output tensors for '{}', possible values are '{}'".format( - output_arrays, output_names)) + if None in shape[1:]: + raise ValueError( + "None is only supported in the 1st dimension. Tensor '{0}' has " + "invalid shape '{1}'.".format(tensor.name, shape)) + elif shape[0] is None: + shape[0] = batch_size + tensor.set_shape(shape) + output_names = [node.split(":")[0] for node in outputs] frozen_graph_def = tf_graph_util.convert_variables_to_constants( sess, graph.as_graph_def(), output_names) - # Toco requires fully defined tensor shape, for input tensor with None in - # their shape, e.g., (None, 224, 224, 3), we need to replace first None with - # a given batch size. For shape with more None, e.g. (None, None, None, 3), - # still be able to replace and convert, but require further investigation. - # TODO(zhixianyan): Add supports for input tensor with more None in shape. - for i in range(len(in_tensors)): - shape = in_tensors[i].get_shape().as_list() - if shape[0] is None: - shape[0] = batch_size - if None in shape[1:]: - raise ValueError( - "Only support None shape at 1st dim as batch_size. But tensor " - "'{}' 's shape '{}' has None at other dimension. ".format( - inputs[i], shape)) - in_tensors[i].set_shape(shape) + return frozen_graph_def, in_tensors, out_tensors + raise ValueError("Unable to load Session.") - result = lite.toco_convert(frozen_graph_def, in_tensors, out_tensors) - if output_tflite is not None: - with gfile.Open(output_tflite, "wb") as f: - f.write(result) - logging.info("Successfully converted to: %s", output_tflite) +def saved_model_to_frozen_graphdef( + saved_model_dir, + output_file_model, + output_file_flags, + input_arrays=None, + input_shapes=None, + output_arrays=None, + tag_set=None, + signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, + batch_size=1): + """Converts a SavedModel to a frozen graph. Writes graph to tmp directory. - return result + Stores frozen graph and command line flags in the tmp directory. + Args: + saved_model_dir: SavedModel directory to convert. + output_file_model: Full file path to save frozen graph. + output_file_flags: Full file path to save ModelFlags. + input_arrays: List of input tensors to freeze graph with. Uses input arrays + from SignatureDef when none are provided. (default None) + input_shapes: Map of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : None}). + (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to + analyze. All tags in the tag set must be present. (default "serve") + signature_key: Key identifying SignatureDef containing inputs and outputs. + batch_size: Batch size for the model. Replaces the first dimension of an + input size array if undefined. (default 1) + + Returns: None. -def main(_): - convert( - saved_model_dir=flags.FLAGS.saved_model_dir, - output_tflite=flags.FLAGS.output_tflite, - output_arrays=flags.FLAGS.output_arrays, - batch_size=flags.FLAGS.batch_size, - tag_set=set(flags.FLAGS.tag_set.split(",")), - signature_key=flags.FLAGS.signature_key) + Raises: + ValueError: Unable to convert to frozen graph. + """ + frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model( + saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set, + signature_key, batch_size) + + # Initialize model flags. + model = model_flags_pb2.ModelFlags() + + for input_tensor in in_tensors: + input_array = model.input_arrays.add() + input_array.name = convert.tensor_name(input_tensor) + input_array.shape.dims.extend(map(int, input_tensor.get_shape())) + + for output_tensor in out_tensors: + model.output_arrays.append(convert.tensor_name(output_tensor)) + + # Write model and ModelFlags to file. ModelFlags contain input array and + # output array information that is parsed from the SignatureDef and used for + # analysis by TOCO. + _write_and_flush_file(output_file_model, frozen_graph_def.SerializeToString()) + _write_and_flush_file(output_file_flags, model.SerializeToString()) + + +def tflite_from_saved_model( + saved_model_dir, + output_file=None, + input_arrays=None, + input_shapes=None, + output_arrays=None, + tag_set=None, + signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, + batch_size=1, + inference_type=lite_constants.FLOAT, + input_format=lite_constants.TENSORFLOW_GRAPHDEF, + output_format=lite_constants.TFLITE, + quantized_input_stats=None, + drop_control_dependency=True): + """Converts a SavedModel to TFLite FlatBuffer. + Args: + saved_model_dir: SavedModel directory to convert. + output_file: File path to write result TFLite FlatBuffer. + input_arrays: List of input tensors to freeze graph with. Uses input arrays + from SignatureDef when none are provided. (default None) + input_shapes: Map of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : None}). + (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to + analyze. All tags in the tag set must be present. (default "serve") + signature_key: Key identifying SignatureDef containing inputs and outputs. + batch_size: Batch size for the model. Replaces the first dimension of an + input size array if undefined. (default 1) + inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`. + input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF). + output_format: Type of data to write (currently must be TFLITE or + GRAPHVIZ_DOT) + quantized_input_stats: For each member of input_tensors the mean and + std deviation of training data. Only needed if `inference_type` is + `QUANTIZED_UINT8`. + drop_control_dependency: Drops control dependencies silently. This is due + to tf lite not supporting control dependencies. -if __name__ == "__main__": - app.run(main) + Returns: + The converted data. For example if tflite was the destination, then + this will be a tflite flatbuffer in a bytes array. + + Raises: + ValueError: Unable to convert to frozen graph. + """ + frozen_graph_def, in_tensors, out_tensors = _freeze_saved_model( + saved_model_dir, input_arrays, input_shapes, output_arrays, tag_set, + signature_key, batch_size) + + result = convert.toco_convert( + input_data=frozen_graph_def, + input_tensors=in_tensors, + output_tensors=out_tensors, + inference_type=inference_type, + input_format=input_format, + output_format=output_format, + quantized_input_stats=quantized_input_stats, + drop_control_dependency=drop_control_dependency) + + if output_file is not None: + with gfile.Open(output_file, "wb") as f: + f.write(result) + logging.info("Successfully converted to: %s", output_file) + + return result diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py index 734e42d619bdb79de0306a94e304ce46065d14d4..db95fc8ad7f94b52d33c72f6ec5819bdfe8cf05f 100644 --- a/tensorflow/contrib/lite/python/convert_saved_model_test.py +++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""TF Lite SavedModel Conversion test cases. - - - test on generated saved_models from simple graphs (sanity check) - - test mnist savedmodel generated on-the-fly +"""TFLite SavedModel conversion test cases. + - Tests converting simple SavedModel graph to TFLite FlatBuffer. + - Tests converting simple SavedModel graph to frozen graph. + - Tests converting MNIST SavedModel to TFLite FlatBuffer. """ from __future__ import absolute_import @@ -25,6 +25,7 @@ from __future__ import print_function import os from tensorflow.contrib.lite.python import convert_saved_model +from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 from tensorflow.python import keras from tensorflow.python.client import session from tensorflow.python.estimator import estimator_lib as estimator @@ -37,6 +38,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops from tensorflow.python.ops.losses import losses +from tensorflow.python.platform import gfile from tensorflow.python.platform import test from tensorflow.python.saved_model import saved_model from tensorflow.python.training import training as train @@ -45,7 +47,7 @@ from tensorflow.python.training import training as train class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase): def _createSimpleSavedModel(self, shape): - """Create a simple savedmodel on the fly.""" + """Create a simple SavedModel on the fly.""" saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel") with session.Session() as sess: in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32) @@ -56,44 +58,78 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase): return saved_model_dir def testSimpleSavedModel(self): - """Test a simple savedmodel created on the fly.""" - # Create a simple savedmodel + """Test a simple SavedModel created on the fly.""" + # Create a simple SavedModel saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) # Convert to tflite - result = convert_saved_model.convert(saved_model_dir=saved_model_dir) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir) self.assertTrue(result) def testSimpleSavedModelWithNoneBatchSizeInShape(self): - """Test a simple savedmodel, with None in input tensor's shape.""" + """Test a simple SavedModel, with None in input tensor's shape.""" saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3]) - result = convert_saved_model.convert(saved_model_dir=saved_model_dir) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir) self.assertTrue(result) def testSimpleSavedModelWithMoreNoneInShape(self): - """Test a simple savedmodel, fail as more None in input shape.""" + """Test a simple SavedModel, fail as more None in input shape.""" saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, None, 3]) # Convert to tflite: this should raise ValueError, as 3rd dim is None. with self.assertRaises(ValueError): - convert_saved_model.convert(saved_model_dir=saved_model_dir) + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir) def testSimpleSavedModelWithWrongSignatureKey(self): - """Test a simple savedmodel, fail as given signature is invalid.""" + """Test a simple SavedModel, fail as given signature is invalid.""" saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) # Convert to tflite: this should raise ValueError, as # signature_key does not exit in the saved_model. with self.assertRaises(ValueError): - convert_saved_model.convert( + convert_saved_model.tflite_from_saved_model( saved_model_dir=saved_model_dir, signature_key="wrong-key") def testSimpleSavedModelWithWrongOutputArray(self): - """Test a simple savedmodel, fail as given output_arrays is invalid.""" - # Create a simple savedmodel + """Test a simple SavedModel, fail as given output_arrays is invalid.""" + # Create a simple SavedModel saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) # Convert to tflite: this should raise ValueError, as # output_arrays is not valid for the saved_model. with self.assertRaises(ValueError): - convert_saved_model.convert( - saved_model_dir=saved_model_dir, output_arrays="wrong-output") + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, output_arrays=["wrong-output"]) + + def testSimpleSavedModelWithWrongInputArrays(self): + """Test a simple SavedModel, fail as given input_arrays is invalid.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + # Checks invalid input_arrays. + with self.assertRaises(ValueError): + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, input_arrays=["wrong-input"]) + # Checks valid and invalid input_arrays. + with self.assertRaises(ValueError): + convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, + input_arrays=["Placeholder", "wrong-input"]) + + def testSimpleSavedModelWithCorrectArrays(self): + """Test a simple SavedModel, with correct input_arrays and output_arrays.""" + saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3]) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, + input_arrays=["Placeholder"], + output_arrays=["add"]) + self.assertTrue(result) + + def testSimpleSavedModelWithCorrectInputArrays(self): + """Test a simple SavedModel, with correct input_arrays and input_shapes.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + result = convert_saved_model.tflite_from_saved_model( + saved_model_dir=saved_model_dir, + input_arrays=["Placeholder"], + input_shapes={"Placeholder": [1, 16, 16, 3]}) + self.assertTrue(result) def testMultipleMetaGraphDef(self): """Test saved model with multiple MetaGraphDef.""" @@ -119,20 +155,103 @@ class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase): sess, tags=[saved_model.tag_constants.SERVING, "additional_test_tag"], signature_def_map=signature_def_map) + # MetaGraphDef 2 builder.add_meta_graph(tags=["tflite"]) builder.save(True) # Convert to tflite - convert_saved_model.convert( + convert_saved_model.tflite_from_saved_model( saved_model_dir=saved_model_dir, tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"])) +class ConvertSavedModelTestBasicGraphToText(test_util.TensorFlowTestCase): + + def _createSimpleSavedModel(self, shape): + """Create a simple SavedModel.""" + saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel") + with session.Session() as sess: + in_tensor_1 = array_ops.placeholder( + shape=shape, dtype=dtypes.float32, name="inputB") + in_tensor_2 = array_ops.placeholder( + shape=shape, dtype=dtypes.float32, name="inputA") + out_tensor = in_tensor_1 + in_tensor_2 + inputs = {"x": in_tensor_1, "y": in_tensor_2} + outputs = {"z": out_tensor} + saved_model.simple_save(sess, saved_model_dir, inputs, outputs) + return saved_model_dir + + def _getInputArrayNames(self, model_proto): + return [data.name for data in model_proto.input_arrays] + + def _getInputArrayShapes(self, model_proto): + return [ + [dim for dim in data.shape.dims] for data in model_proto.input_arrays + ] + + def _get_model_flags_proto_from_file(self, filename): + proto = _model_flags_pb2.ModelFlags() + with gfile.Open(filename, "rb") as output_file: + proto.ParseFromString(output_file.read()) + output_file.close() + return proto + + def testSimpleSavedModel(self): + """Test a simple SavedModel.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + output_file_model = os.path.join(self.get_temp_dir(), "model.pb") + output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt") + + convert_saved_model.saved_model_to_frozen_graphdef( + saved_model_dir=saved_model_dir, + output_file_model=output_file_model, + output_file_flags=output_file_flags, + input_arrays=["inputB", "inputA"]) + + proto = self._get_model_flags_proto_from_file(output_file_flags) + self.assertEqual(proto.output_arrays, ["add"]) + self.assertEqual(self._getInputArrayNames(proto), ["inputA", "inputB"]) + self.assertEqual( + self._getInputArrayShapes(proto), [[1, 16, 16, 3], [1, 16, 16, 3]]) + + def testSimpleSavedModelWithDifferentInputNames(self): + """Test a simple SavedModel.""" + saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3]) + output_file_model = os.path.join(self.get_temp_dir(), "model.pb") + output_file_flags = os.path.join(self.get_temp_dir(), "model.pbtxt") + + # Check case where input shape is given. + convert_saved_model.saved_model_to_frozen_graphdef( + saved_model_dir=saved_model_dir, + output_file_model=output_file_model, + output_file_flags=output_file_flags, + input_arrays=["inputA"], + input_shapes={"inputA": [1, 16, 16, 3]}) + + proto = self._get_model_flags_proto_from_file(output_file_flags) + self.assertEqual(proto.output_arrays, ["add"]) + self.assertEqual(self._getInputArrayNames(proto), ["inputA"]) + self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]]) + + # Check case where input shape is None. + convert_saved_model.saved_model_to_frozen_graphdef( + saved_model_dir=saved_model_dir, + output_file_model=output_file_model, + output_file_flags=output_file_flags, + input_arrays=["inputA"], + input_shapes={"inputA": None}) + + proto = self._get_model_flags_proto_from_file(output_file_flags) + self.assertEqual(proto.output_arrays, ["add"]) + self.assertEqual(self._getInputArrayNames(proto), ["inputA"]) + self.assertEqual(self._getInputArrayShapes(proto), [[1, 16, 16, 3]]) + + class Model(keras.Model): """Model to recognize digits in the MNIST dataset. - Train and export savedmodel, used for testOnflyTrainMnistSavedModel + Train and export SavedModel, used for testOnflyTrainMnistSavedModel Network structure is equivalent to: https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py @@ -238,7 +357,7 @@ def dummy_input_fn(): class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase): def testTrainedMnistSavedModel(self): - """Test mnist savedmodel, trained with dummy data and small steps.""" + """Test mnist SavedModel, trained with dummy data and small steps.""" # Build classifier classifier = estimator.Estimator( model_fn=model_fn, @@ -253,21 +372,20 @@ class ConvertSavedModelTestTrainGraph(test_util.TensorFlowTestCase): "image": image, }) - # Export savedmodel + # Export SavedModel saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel") classifier.export_savedmodel(saved_model_dir, pred_input_fn) # Convert to tflite and test output saved_model_name = os.listdir(saved_model_dir)[0] saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name) - output_tflite = os.path.join(saved_model_dir, - saved_model_final_dir + ".lite") + output_file = os.path.join(saved_model_dir, saved_model_final_dir + ".lite") # TODO(zhixianyan): no need to limit output_arrays to `Softmax' # once b/74205001 fixed and argmax implemented in tflite. - result = convert_saved_model.convert( + result = convert_saved_model.tflite_from_saved_model( saved_model_dir=saved_model_final_dir, - output_arrays="Softmax", - output_tflite=output_tflite) + output_arrays=["Softmax"], + output_file=output_file) self.assertTrue(result) diff --git a/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..4d9782f4a6a9e853c3afdbd97d4264a818937e63 --- /dev/null +++ b/tensorflow/contrib/lite/python/convert_saved_model_to_frozen_graph.py @@ -0,0 +1,106 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python console command for generating frozen models from SavedModels. + +This exists to add SavedModel compatibility to TOCO. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import sys +from tensorflow.contrib.lite.python.convert_saved_model import saved_model_to_frozen_graphdef +from tensorflow.python.platform import app + +FLAGS = None + + +def execute(unused_args): + """Calls function to convert the SavedModel to a frozen graph.""" + # Error handling. + if FLAGS.input_shapes and not FLAGS.input_arrays: + raise ValueError("Input shapes requires input arrays to be specified.") + + # Calls saved_model_to_frozen_graphdef function to generate frozen graph. + input_arrays = (FLAGS.input_arrays.split(",") if FLAGS.input_arrays else None) + input_shapes = None + if FLAGS.input_shapes: + input_shapes = { + input_arrays[idx]: shape.split(",") + for idx, shape in enumerate(FLAGS.input_shapes.split(":")) + } + output_arrays = ( + FLAGS.output_arrays.split(",") if FLAGS.output_arrays else None) + tag_set = set(FLAGS.tag_set.split(",")) if FLAGS.tag_set else None + + saved_model_to_frozen_graphdef( + saved_model_dir=FLAGS.saved_model_directory, + output_file_model=FLAGS.output_file_model, + output_file_flags=FLAGS.output_file_flags, + input_arrays=input_arrays, + input_shapes=input_shapes, + output_arrays=output_arrays, + tag_set=tag_set, + signature_key=FLAGS.signature_key, + batch_size=FLAGS.batch_size) + + +def main(): + global FLAGS + # Parses flags. + parser = argparse.ArgumentParser( + description="Invoke SavedModel to frozen model converter.") + parser.add_argument( + "saved_model_directory", + type=str, + help="Full path to directory containing the SavedModel.") + parser.add_argument( + "output_file_model", + type=str, + help="Full file path to save frozen graph.") + parser.add_argument( + "output_file_flags", type=str, help="Full file path to save ModelFlags.") + parser.add_argument( + "--input_arrays", + type=str, + help="Name of the input arrays, comma-separated.") + parser.add_argument( + "--input_shapes", + type=str, + help="Shapes corresponding to --input_arrays, colon-separated.") + parser.add_argument( + "--output_arrays", + type=str, + help="Name of the output arrays, comma-separated.") + parser.add_argument( + "--tag_set", type=str, help="Name of output arrays, comma-separated.") + parser.add_argument( + "--signature_key", + type=str, + help="Key identifying SignatureDef containing inputs and outputs.") + parser.add_argument( + "--batch_size", + type=int, + help="Batch size for the model. Replaces the first dimension of an " + "input size array if undefined.") + + FLAGS, unparsed = parser.parse_known_args() + + app.run(main=execute, argv=[sys.argv[0]] + unparsed) + + +if __name__ == "__main__": + main() diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/convert_test.py similarity index 82% rename from tensorflow/contrib/lite/python/lite_test.py rename to tensorflow/contrib/lite/python/convert_test.py index b8b4510188bee867b32ffde714b27f41a1df778a..dc21a9b66933f595a5f31b0b91ff247a5458dad6 100644 --- a/tensorflow/contrib/lite/python/lite_test.py +++ b/tensorflow/contrib/lite/python/convert_test.py @@ -17,8 +17,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.lite.python import lite -from tensorflow.contrib.lite.python.op_hint import _tensor_name_base as _tensor_name_base +from tensorflow.contrib.lite.python import convert +from tensorflow.contrib.lite.python import lite_constants +from tensorflow.contrib.lite.python import op_hint from tensorflow.python.client import session from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util @@ -29,7 +30,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class LiteTest(test_util.TensorFlowTestCase): +class ConvertTest(test_util.TensorFlowTestCase): def testBasic(self): in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3], @@ -37,13 +38,13 @@ class LiteTest(test_util.TensorFlowTestCase): out_tensor = in_tensor + in_tensor sess = session.Session() # Try running on valid graph - result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor]) + result = convert.toco_convert(sess.graph_def, [in_tensor], [out_tensor]) self.assertTrue(result) # TODO(aselle): remove tests that fail (we must get TOCO to not fatal # all the time). # Try running on identity graph (known fail) # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"): - # result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor]) + # result = convert.toco_convert(sess.graph_def, [in_tensor], [in_tensor]) def testQuantization(self): in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3], @@ -51,13 +52,14 @@ class LiteTest(test_util.TensorFlowTestCase): out_tensor = array_ops.fake_quant_with_min_max_args(in_tensor + in_tensor, min=0., max=1.) sess = session.Session() - result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor], - inference_type=lite.QUANTIZED_UINT8, - quantized_input_stats=[(0., 1.)]) + result = convert.toco_convert( + sess.graph_def, [in_tensor], [out_tensor], + inference_type=lite_constants.QUANTIZED_UINT8, + quantized_input_stats=[(0., 1.)]) self.assertTrue(result) -class LiteTestOpHint(test_util.TensorFlowTestCase): +class ConvertTestOpHint(test_util.TensorFlowTestCase): """Test the hint to stub functionality.""" def _getGraphOpTypes(self, graphdef, output_nodes): @@ -99,7 +101,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): swish_scale = array_ops.constant(1.0) def _swish(input_tensor, scale): - custom = lite.OpHint("cool_activation") + custom = op_hint.OpHint("cool_activation") input_tensor, scale = custom.add_inputs(input_tensor, scale) output = math_ops.sigmoid(input_tensor) * input_tensor * scale output, = custom.add_outputs(output) @@ -111,11 +113,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): # and 1 final output). self.assertEqual(self._countIdentities(sess.graph_def.node), 4) - stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess) self.assertCountEqual( self._getGraphOpTypes( - stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + stubbed_graphdef, + output_nodes=[op_hint._tensor_name_base(output)]), ["cool_activation", "Const", "Identity"]) def testScaleAndBiasAndIdentity(self): @@ -125,7 +128,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): b = array_ops.constant([4., 5.]) def _scaled_and_bias_and_identity(a, x, b): - custom = lite.OpHint("scale_and_bias_and_identity") + custom = op_hint.OpHint("scale_and_bias_and_identity") a, x, b = custom.add_inputs(a, x, b) return custom.add_outputs(a * x + b, x) output = array_ops.identity(_scaled_and_bias_and_identity(a, x, b), @@ -136,11 +139,12 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): # +1 for the final output self.assertEqual(self._countIdentities(sess.graph_def.node), 6) - stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess) self.assertCountEqual( self._getGraphOpTypes( - stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + stubbed_graphdef, + output_nodes=[op_hint._tensor_name_base(output)]), ["scale_and_bias_and_identity", "Const", "Identity", "Pack"]) def testTwoFunctions(self): @@ -148,7 +152,7 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): a = array_ops.constant([1.]) b = array_ops.constant([1.]) def _double_values(x): - custom = lite.OpHint("add_test") + custom = op_hint.OpHint("add_test") x = custom.add_inputs(x) output = math_ops.multiply(x, x) output, = custom.add_outputs(output) @@ -160,10 +164,11 @@ class LiteTestOpHint(test_util.TensorFlowTestCase): # make sure one identity for each input (2) and output (2) => 2 + 2 # +1 for the final output self.assertEqual(self._countIdentities(sess.graph_def.node), 5) - stubbed_graphdef = lite.convert_op_hints_to_stubs(sess) + stubbed_graphdef = op_hint.convert_op_hints_to_stubs(sess) self.assertCountEqual( self._getGraphOpTypes( - stubbed_graphdef, output_nodes=[_tensor_name_base(output)]), + stubbed_graphdef, + output_nodes=[op_hint._tensor_name_base(output)]), ["add_test", "Const", "Identity", "Add"]) diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py index cf50f9d4d65cb7a36af8f82e2d29babbc9884d23..4ea40201f73bb0aabee60866dde2337862149dc7 100644 --- a/tensorflow/contrib/lite/python/lite.py +++ b/tensorflow/contrib/lite/python/lite.py @@ -18,6 +18,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice. @@toco_convert @@toco_convert_protos +@@tflite_from_saved_model @@OpHint @@convert_op_hints_to_stubs @@ -25,208 +26,11 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice. from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os as _os -import subprocess as _subprocess -import tempfile as _tempfile # pylint: disable=unused-import +from tensorflow.contrib.lite.python.convert import toco_convert +from tensorflow.contrib.lite.python.convert import toco_convert_protos +from tensorflow.contrib.lite.python.convert_saved_model import tflite_from_saved_model from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs from tensorflow.contrib.lite.python.op_hint import OpHint # pylint: enable=unused-import -from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2 -from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 -from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2 -from tensorflow.python.framework import dtypes as _dtypes -from tensorflow.python.platform import resource_loader as _resource_loader -from tensorflow.python.util.all_util import remove_undocumented -from tensorflow.python.util.lazy_loader import LazyLoader - -# Lazy load since some of the performance benchmark skylark rules -# break dependencies. -_toco_python = LazyLoader( - "tensorflow_wrap_toco", globals(), - "tensorflow.contrib.lite.toco.python." - "tensorflow_wrap_toco") -del LazyLoader - -# Enum types from the protobuf promoted to the API -FLOAT = _types_pb2.FLOAT -INT32 = _types_pb2.INT32 -INT64 = _types_pb2.INT64 -STRING = _types_pb2.STRING -QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8 -TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF -TFLITE = _toco_flags_pb2.TFLITE -GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT - -# Currently the default mode of operation is to shell to another python process -# to protect against crashes. However, it breaks some dependent targets because -# it forces us to depend on an external py_binary. The experimental API doesn't -# have that drawback. -EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False - -# Find the toco_from_protos binary using the resource loader if using from -# bazel, otherwise we are in a pip where console_scripts already has -# the toco_from_protos tool. -if EXPERIMENTAL_USE_TOCO_API_DIRECTLY: - _toco_from_proto_bin = "" -else: - _toco_from_proto_bin = _resource_loader.get_path_to_datafile( - "../toco/python/toco_from_protos") - -if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin): - _toco_from_proto_bin = "toco_from_protos" - - -def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str): - """Convert `input_data_str` according to model and toco parameters. - - Unless you know what you are doing consider using - the more friendly @{tf.contrib.lite.toco_convert}}. - - Args: - model_flags_str: Serialized proto describing model properties, see - `toco/model_flags.proto`. - toco_flags_str: Serialized proto describing conversion properties, see - `toco/toco_flags.proto`. - input_data_str: Input data in serialized form (e.g. a graphdef is common) - Returns: - Converted model in serialized form (e.g. a TFLITE model is common). - Raises: - RuntimeError: When conversion fails, an exception is raised with the error - message embedded. - """ - # TODO(aselle): When toco does not use fatal errors for failure, we can - # switch this on. - if not _toco_from_proto_bin: - return _toco_python.TocoConvert( - model_flags_str, toco_flags_str, input_data_str) - - with _tempfile.NamedTemporaryFile() as fp_toco, \ - _tempfile.NamedTemporaryFile() as fp_model, \ - _tempfile.NamedTemporaryFile() as fp_input, \ - _tempfile.NamedTemporaryFile() as fp_output: - fp_model.write(model_flags_str) - fp_toco.write(toco_flags_str) - fp_input.write(input_data_str) - fp_model.flush() - fp_toco.flush() - fp_input.flush() - - cmd = [ - _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name, - fp_output.name - ] - cmdline = " ".join(cmd) - proc = _subprocess.Popen( - cmdline, - shell=True, - stdout=_subprocess.PIPE, - stderr=_subprocess.STDOUT, - close_fds=True) - stdout, stderr = proc.communicate() - exitcode = proc.returncode - if exitcode == 0: - stuff = fp_output.read() - return stuff - else: - raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" % - (stdout, stderr)) - - -def _tensor_name(x): - return x.name.split(":")[0] - - -def toco_convert(input_data, - input_tensors, - output_tensors, - inference_type=FLOAT, - input_format=TENSORFLOW_GRAPHDEF, - output_format=TFLITE, - quantized_input_stats=None, - drop_control_dependency=True, - allow_custom_ops=None): - """Convert a model using TOCO from `input_format` to `output_format`. - - Typically this is to convert from TensorFlow GraphDef to TFLite, in which - case the default `input_format` and `output_format` are sufficient. - - Args: - input_data: Input data (i.e. often `sess.graph_def`). - input_tensors: List of input tensors. Type and shape are computed using - `foo.get_shape()` and `foo.dtype`. - output_tensors: List of output tensors (only .name is used from this). - inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`. - input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF). - output_format: Type of data to write (currently must be TFLITE or - GRAPHVIZ_DOT) - quantized_input_stats: For each member of input_tensors the mean and - std deviation of training data. Only needed if `inference_type` is - `QUANTIZED_UINT8`. - drop_control_dependency: Drops control dependencies silently. This is due - to tf lite not supporting control dependencies. - - Returns: - The converted data. For example if tflite was the destination, then - this will be a tflite flatbuffer in a bytes array. - - Raises: - ValueError: If the input tensor type is unknown - RuntimeError: If TOCO fails to convert (in which case the runtime error's - error text will contain the TOCO error log) - """ - toco = _toco_flags_pb2.TocoFlags() - toco.input_format = input_format - toco.output_format = output_format - toco.inference_type = inference_type - toco.drop_control_dependency = drop_control_dependency - if allow_custom_ops is not None: - toco.allow_custom_ops = allow_custom_ops - - model = _model_flags_pb2.ModelFlags() - for idx, input_tensor in enumerate(input_tensors): - if input_tensor.dtype == _dtypes.float32: - tflite_input_type = FLOAT - elif input_tensor.dtype == _dtypes.int32: - tflite_input_type = INT32 - elif input_tensor.dtype == _dtypes.int64: - tflite_input_type = INT64 - # TODO(aselle): Insert strings when they are available - else: - raise ValueError("Tensors %s not known type %r" % (input_tensor.name, - input_tensor.dtype)) - - input_array = model.input_arrays.add() - - if inference_type == QUANTIZED_UINT8: - if tflite_input_type == FLOAT: - tflite_input_type = QUANTIZED_UINT8 - input_array.mean_value, input_array.std_value = quantized_input_stats[idx] - - input_array.name = _tensor_name(input_tensor) - input_array.shape.dims.extend(map(int, input_tensor.get_shape())) - - for output_tensor in output_tensors: - model.output_arrays.append(_tensor_name(output_tensor)) - - # TODO(aselle): Consider handling the case of allowing quantized - # inputs to be converted to float (via the toco.inference_input_type field). - data = toco_convert_protos(model.SerializeToString(), - toco.SerializeToString(), - input_data.SerializeToString()) - return data - - -_allowed_symbols = [ - "FLOAT", - "INT32", - "INT64", - "STRING", - "QUANTIZED_UINT8", - "TENSORFLOW_GRAPHDEF", - "TFLITE", - "GRAPHVIZ_DOT", - "EXPERIMENTAL_USE_TOCO_API_DIRECTLY", -] -remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/lite/python/lite_constants.py b/tensorflow/contrib/lite/python/lite_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..195d7a732f337676937c7af5137d4dea84989c03 --- /dev/null +++ b/tensorflow/contrib/lite/python/lite_constants.py @@ -0,0 +1,53 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Constants for TFLite.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2 +from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2 +from tensorflow.python.util.all_util import remove_undocumented + +# Enum types from the protobuf promoted to the API +FLOAT = _types_pb2.FLOAT +INT32 = _types_pb2.INT32 +INT64 = _types_pb2.INT64 +STRING = _types_pb2.STRING +QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8 +TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF +TFLITE = _toco_flags_pb2.TFLITE +GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT + +# Currently the default mode of operation is to shell to another python process +# to protect against crashes. However, it breaks some dependent targets because +# it forces us to depend on an external py_binary. The experimental API doesn't +# have that drawback. +EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False + + +_allowed_symbols = [ + "FLOAT", + "INT32", + "INT64", + "STRING", + "QUANTIZED_UINT8", + "TENSORFLOW_GRAPHDEF", + "TFLITE", + "GRAPHVIZ_DOT", + "EXPERIMENTAL_USE_TOCO_API_DIRECTLY", +] +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs index 2b62c257d8410f9af1b250c9d108eba6737a9efe..a65c2e0c70dcf9f97e49cc749132afbd5fc69365 100644 --- a/tensorflow/contrib/lite/schema/schema.fbs +++ b/tensorflow/contrib/lite/schema/schema.fbs @@ -435,21 +435,25 @@ table Operator { custom_options_format:CustomOptionsFormat; } -// The root type, defining a model. +// The root type, defining a subgraph, which typically represents an entire +// model. table SubGraph { - // A list of all tensors used in this model. + // A list of all tensors used in this subgraph. tensors:[Tensor]; - // Indices of the input tensors. + // Indices of the tensors that are inputs into this subgraph. Note this is + // the list of non-static tensors that feed into the subgraph for inference. inputs:[int]; - // Indices of the output tensors. + // Indices of the tensors that are outputs out of this subgraph. Note this is + // the list of output tensors that are considered the product of the + // subgraph's inference. outputs:[int]; // All operators, in execution order. operators:[Operator]; - // Name of subgraph (used for debugging). + // Name of this subgraph (used for debugging). name:string; } @@ -475,7 +479,10 @@ table Model { // A description of the model. description:string; - // Buffers of the model + // Buffers of the model. + // Note the 0th entry of this array must be an empty buffer (sentinel). + // This is a convention so that tensors without a buffer can provide 0 as + // their buffer. buffers:[Buffer]; } diff --git a/tensorflow/contrib/lite/string_util.cc b/tensorflow/contrib/lite/string_util.cc index cd41299d38361321503d421272426a9d1082c937..a89776b29f895fe82ee71efe00c0949c58c109df 100644 --- a/tensorflow/contrib/lite/string_util.cc +++ b/tensorflow/contrib/lite/string_util.cc @@ -24,7 +24,10 @@ namespace tflite { namespace { // Convenient method to get pointer to int32_t. -int32_t* GetIntPtr(char* ptr) { return reinterpret_cast(ptr); } +const int32_t* GetIntPtr(const char* ptr) { + return reinterpret_cast(ptr); +} + } // namespace void DynamicBuffer::AddString(const char* str, size_t len) { @@ -64,7 +67,7 @@ void DynamicBuffer::AddJoinedString(const std::vector& strings, offset_.push_back(offset_.back() + total_len); } -void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) { +int DynamicBuffer::WriteToBuffer(char** buffer) { // Allocate sufficient memory to tensor buffer. int32_t num_strings = offset_.size() - 1; // Total bytes include: @@ -75,43 +78,57 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) { int32_t bytes = data_.size() // size of content + sizeof(int32_t) * (num_strings + 2); // size of header - // Output tensor will take over the ownership of tensor_buffer, and free it - // during Interpreter destruction. - char* tensor_buffer = static_cast(malloc(bytes)); + // Caller will take ownership of buffer. + *buffer = reinterpret_cast(malloc(bytes)); // Set num of string - memcpy(tensor_buffer, &num_strings, sizeof(int32_t)); + memcpy(*buffer, &num_strings, sizeof(int32_t)); // Set offset of strings. int32_t start = sizeof(int32_t) * (num_strings + 2); for (int i = 0; i < offset_.size(); i++) { int32_t offset = start + offset_[i]; - memcpy(tensor_buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t)); + memcpy(*buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t)); } // Copy data of strings. - memcpy(tensor_buffer + start, data_.data(), data_.size()); + memcpy(*buffer + start, data_.data(), data_.size()); + return bytes; +} + +void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) { + char* tensor_buffer; + int bytes = WriteToBuffer(&tensor_buffer); // Set tensor content pointer to tensor_buffer, and release original data. auto dims = TfLiteIntArrayCreate(1); - dims->data[0] = num_strings; + dims->data[0] = offset_.size() - 1; // Store number of strings. TfLiteTensorReset(tensor->type, tensor->name, dims, tensor->params, tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation, tensor); } +int GetStringCount(const char* raw_buffer) { + // The first integers in the raw buffer is the number of strings. + return *GetIntPtr(raw_buffer); +} + int GetStringCount(const TfLiteTensor* tensor) { // The first integers in the raw buffer is the number of strings. - return *GetIntPtr(tensor->data.raw); + return GetStringCount(tensor->data.raw); } -StringRef GetString(const TfLiteTensor* tensor, int string_index) { - int32_t* offset = - GetIntPtr(tensor->data.raw + sizeof(int32_t) * (string_index + 1)); +StringRef GetString(const char* raw_buffer, int string_index) { + const int32_t* offset = + GetIntPtr(raw_buffer + sizeof(int32_t) * (string_index + 1)); return { - tensor->data.raw + (*offset), + raw_buffer + (*offset), (*(offset + 1)) - (*offset), }; } +StringRef GetString(const TfLiteTensor* tensor, int string_index) { + return GetString(tensor->data.raw, string_index); +} + } // namespace tflite diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h index c35a2fff3c23b17515323b65d08df6f6da288834..57f129bf5ea7ab7e792cc01f83e281922dbfb834 100644 --- a/tensorflow/contrib/lite/string_util.h +++ b/tensorflow/contrib/lite/string_util.h @@ -49,7 +49,7 @@ namespace tflite { // Convenient structure to store string pointer and length. typedef struct { - char* str; + const char* str; int len; } StringRef; @@ -70,6 +70,10 @@ class DynamicBuffer { // buffer. void AddJoinedString(const std::vector& strings, char separator); + // Fill content into a buffer and returns the number of bytes stored. + // The function allocates space for the buffer but does NOT take ownership. + int WriteToBuffer(char** buffer); + // Fill content into a string tensor. void WriteToTensor(TfLiteTensor* tensor); @@ -81,10 +85,12 @@ class DynamicBuffer { }; // Return num of strings in a String tensor. +int GetStringCount(const char* raw_buffer); int GetStringCount(const TfLiteTensor* tensor); // Get String pointer and length of index-th string in tensor. // NOTE: This will not create a copy of string data. +StringRef GetString(const char* raw_buffer, int string_index); StringRef GetString(const TfLiteTensor* tensor, int string_index); } // namespace tflite diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc index c289ddcd929c74e9a5b0eaf2e0c8fc1a6a01d320..5bb0e3ba4d289ccfcb54198230f53c62222963a2 100644 --- a/tensorflow/contrib/lite/toco/dump_graphviz.cc +++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc @@ -259,6 +259,19 @@ NodeProperties GetPropertiesForOperator(const Operator& op) { node_properties.color = Color(0xC5, 0x39, 0x29); // Bolder color break; } + case OperatorType::kFakeQuant: { + const auto& fakequant_op = static_cast(op); + node_properties.color = Color(0xC5, 0x39, 0x29); // Bolder color + if (fakequant_op.minmax) { + AppendF(&node_properties.label, "\\n%dbit [%g,%g]", + fakequant_op.num_bits, fakequant_op.minmax->min, + fakequant_op.minmax->max); + } else { + AppendF(&node_properties.label, "\\n%dbit [?,?]", + fakequant_op.num_bits); + } + break; + } default: node_properties.color = Color(0xDB, 0x44, 0x37); break; diff --git a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc index f098981a5cf4b91df4c7798bd3db8563705a3bd0..c00cdcb944b085dda41033b95c96537cc2e047c3 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/experimental_shuffle_fc_weights.cc @@ -55,17 +55,26 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { // Exit if, based on the known shapes, this FC op is not a GEMV. // The shuffling of FC weights is only useful to enable fast GEMV paths. const Shape& input_shape = input_array.shape(); - for (int i = 0; i < input_shape.dimensions_count() - 1; i++) { + for (int i = 1; i < input_shape.dimensions_count() - 1; i++) { if (input_shape.dims(i) != 1) { // The input activations, shaped as a matrix, have multiple columns. // This FC op isn't a matrix*vector multiplication. AddMessageF( "Not applying experimental shuffling to the weights of %s because " - "it's not a matrix*vector product", + "the input shape is not 1D or 2D (possibly with additional inner " + "dimensions of size 1)", LogName(*op)); return false; } } + if (input_shape.dims(0) != 1 && input_shape.dims(0) != 4) { + AddMessageF( + "Not applying experimental shuffling to the weights of %s because " + "the input shape's leading dimension, i.e. the 'batch size', is not " + "equal to 1 or 4", + LogName(*op)); + return false; + } // Exit if the weights shape isn't an integral multiple of the shuffled // block shape, 4x16. We don't want to have to write code dealing with // odd sizes, that would go un-exercised at the moment as the models @@ -129,6 +138,20 @@ bool ExperimentalShuffleFCWeights::Run(Model* model, std::size_t op_index) { fc_op->experimental_shuffled_weights = true; AddMessageF("Applied experimental shuffling to the weights of %s", LogName(*op)); + // Add a second output array to this FC op, serving as a workspace to perform + // runtime shuffling/xoring of its input activations. + CHECK_EQ(fc_op->outputs.size(), 1); + const string& shuffled_input_workspace_array_name = + AvailableArrayName(*model, fc_op->inputs[0] + "_shuffled"); + fc_op->outputs.push_back(shuffled_input_workspace_array_name); + auto& shuffled_input_workspace_array = + model->GetOrCreateArray(shuffled_input_workspace_array_name); + shuffled_input_workspace_array.data_type = input_array.data_type; + *shuffled_input_workspace_array.mutable_shape() = input_array.shape(); + shuffled_input_workspace_array.GetOrCreateMinMax() = input_array.GetMinMax(); + shuffled_input_workspace_array.GetOrCreateQuantizationParams() = + input_array.GetQuantizationParams(); + return true; } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc index b34aca1f091e70d709558df999c8aa321ff0b46d..be6e0e07dd08abef856fb48892d1fa693dd926e6 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc @@ -168,7 +168,9 @@ void ProcessConvOperator(Model* model, ConvOperator* op) { return; } const auto& input_shape = input_array.shape(); - CHECK_EQ(input_shape.dimensions_count(), 4); + CHECK(input_shape.dimensions_count() == 4) + << "Conv ops require 4D inputs. Input array \"" << op->inputs[0] + << "\" is " << input_shape.dimensions_count() << "D."; const auto& weights_array = model->GetArray(op->inputs[1]); // Yield until weights dims have been resolved. @@ -249,12 +251,6 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) { << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape " << toco::ShapeToString(weights_shape) << "."; - CHECK(weights_shape.dims(0) == 1 && weights_shape.dims(3) == 1) - << "TransposeConv weights dimensions must begin and end with 1. Input " - "weights \"" - << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape " - << toco::ShapeToString(weights_shape) << "."; - // Compute padding const int kheight = weights_shape.dims(1); const int kwidth = weights_shape.dims(2); @@ -269,9 +265,7 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) { LOG(FATAL) << "TransposeConv only supports SAME or VALID padding"; } - // VALIDATE OUTPUT SHAPE - // Compute the output shape from the input and weights shapes to verify it - // agrees with the specified output shape. + // VALIDATE some dimensions and set the output shape. const auto& input_array = model->GetArray(op->inputs[TransposeConvOperator::DATA_INPUT]); if (!input_array.has_shape()) { @@ -283,31 +277,13 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) { << "TransposeConv input shape must have 4 dimensions. Input \"" << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape " << toco::ShapeToString(weights_shape) << "."; + CHECK_EQ(input_shape.dims(3), weights_shape.dims(0)) + << "Input shape depth and weight depth do not agree"; - // Compute output shape - const int input_width = input_shape.dims(2); - const int input_height = input_shape.dims(1); - int output_height = op->stride_height * (input_height - 1); - int output_width = op->stride_width * (input_width - 1); - if (op->padding.type == PaddingType::kValid) { - output_height += kheight; - output_width += kwidth; - } else if (op->padding.type == PaddingType::kSame) { - output_height += 1; - output_width += 1; - } - - CHECK(specified_output_shape_array.GetBuffer().data == - std::vector({input_shape.dims(0), output_height, output_width, - weights_shape.dims(3)})) - << "Specified output shape: " << ShapeToString(output_array.shape()) - << ", does not agree with shape computed from input data and weights: [" - << input_shape.dims(0) << ", " << output_height << ", " << output_width - << ", " << weights_shape.dims(3) << "]."; - - // SUCCESS: Set the op's output shape according to the specified output shape. - *(output_array.mutable_shape()->mutable_dims()) = + // Set the output shape according to the specified output shape. + std::vector const& specified_output_shape = specified_output_shape_array.GetBuffer().data; + *(output_array.mutable_shape()->mutable_dims()) = specified_output_shape; } void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) { @@ -1179,6 +1155,11 @@ void ProcessRankOperator(Model* model, RankOperator* op) { return; } + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes + return; + } + const auto& input_array = model->GetArray(op->inputs[0]); if (!input_array.has_shape()) { // Yield until input dims have been resolved. @@ -1200,6 +1181,11 @@ void ProcessShapeOperator(Model* model, TensorFlowShapeOperator* op) { return; } + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes + return; + } + const auto& input_array = model->GetArray(op->inputs[0]); if (!input_array.has_shape()) { // Yield until input dims have been resolved. @@ -1230,10 +1216,6 @@ void ProcessStackOperator(Model* model, StackOperator* op) { } Shape shape = input_array.shape(); - if (shape.dimensions_count() == 0) { - // Convert 0D scalars to 1D scalars of shape {1}. - shape.mutable_dims()->push_back(1); - } if (!stacked_shape) { stacked_shape.reset(new Shape(shape)); } else { @@ -1516,13 +1498,10 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) { return; } - // The current ArgMax implementation only supports 4-dimensional inputs with - // the last dimension as the axis to perform ArgMax for. const std::vector& input_dims = input_array.shape().dims(); - CHECK_EQ(input_dims.size(), 4); std::vector output_dims; - output_dims.reserve(input_dims.size() - 1); + output_dims.reserve(input_dims.size()); for (int i = 0; i < input_dims.size() - 1; ++i) { output_dims.push_back(input_dims[i]); } diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc index 5e779f6765262326bd59db886c2feed603e0102e..6e78653fad238085da5ba66166884093ea9b0214 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc @@ -233,7 +233,12 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) { } // Check that input data types agree. - CHECK(input0_array.data_type == input1_array.data_type); + CHECK(input0_array.data_type == input1_array.data_type) + << "Dissimilar data types given to op outputting \"" + << binary_op->outputs[0] << "\". 0:\"" << binary_op->inputs[0] << "\"(" + << static_cast(input0_array.data_type) << ") 1:\"" + << binary_op->inputs[1] << "\"(" + << static_cast(input1_array.data_type) << ")."; // Do the actual constants propagation EvaluateBinaryOperatorOnConstantInputs(model, binary_op); diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc index 37beb41dfc5904fc6ace79ebea2420d2ab92fbfb..4bb1217828a9c480241a3b503dffe26462df4063 100644 --- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc +++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc @@ -60,6 +60,11 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) { const auto& output_array_name = mul_op->outputs[0]; auto& output_array = model->GetArray(output_array_name); + if (output_array.data_type == ArrayDataType::kNone) { + // Yield until the output type has been set by PropagateArrayDataTypes + return false; + } + // Yield if the output shape is not known yet. if (!output_array.has_shape()) { return false; diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc index 155d890c9f23ba206f1f0e6db645a601308cea5b..2ed05cb3720662013e4805204aca89bcbf19447a 100644 --- a/tensorflow/contrib/lite/toco/import_tensorflow.cc +++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc @@ -1093,8 +1093,10 @@ void ConvertMatMulOperator(const NodeDef& node, // Transpose flags should be easy to support, but we don't have a // GraphDef with them to test on at the moment. - CHECK_EQ(GetBoolAttr(node, "transpose_a"), false); - CHECK_EQ(GetBoolAttr(node, "transpose_b"), false); + CHECK_EQ(HasAttr(node, "transpose_a") && GetBoolAttr(node, "transpose_a"), + false); + CHECK_EQ(HasAttr(node, "transpose_b") && GetBoolAttr(node, "transpose_b"), + false); CHECK(!HasAttr(node, "adjoint_a") || (GetBoolAttr(node, "adjoint_a") == false)); CHECK(!HasAttr(node, "adjoint_b") || @@ -1300,11 +1302,17 @@ void ConvertStridedSliceOperator(const NodeDef& node, } op->outputs.push_back(node.name()); - op->begin_mask = GetIntAttr(node, "begin_mask"); - op->ellipsis_mask = GetIntAttr(node, "ellipsis_mask"); - op->end_mask = GetIntAttr(node, "end_mask"); - op->new_axis_mask = GetIntAttr(node, "new_axis_mask"); - op->shrink_axis_mask = GetIntAttr(node, "shrink_axis_mask"); + op->begin_mask = + HasAttr(node, "begin_mask") ? GetIntAttr(node, "begin_mask") : 0; + op->ellipsis_mask = + HasAttr(node, "ellipsis_mask") ? GetIntAttr(node, "ellipsis_mask") : 0; + op->end_mask = HasAttr(node, "end_mask") ? GetIntAttr(node, "end_mask") : 0; + op->new_axis_mask = + HasAttr(node, "new_axis_mask") ? GetIntAttr(node, "new_axis_mask") : 0; + op->shrink_axis_mask = HasAttr(node, "shrink_axis_mask") + ? GetIntAttr(node, "shrink_axis_mask") + : 0; + model->operators.emplace_back(op); } @@ -1394,8 +1402,11 @@ void ConvertArgMaxOperator(const NodeDef& node, Model* model) { CHECK_EQ(node.op(), "ArgMax"); CheckInputsCount(node, tf_import_flags, 2); - const auto axis_data_type = GetDataTypeAttr(node, "Tidx"); - const auto output_type = GetDataTypeAttr(node, "output_type"); + const auto axis_data_type = + HasAttr(node, "Tidx") ? GetDataTypeAttr(node, "Tidx") : DT_INT32; + const auto output_type = HasAttr(node, "output_type") + ? GetDataTypeAttr(node, "output_type") + : DT_INT64; CHECK(axis_data_type == DT_INT64 || axis_data_type == DT_INT32); CHECK(output_type == DT_INT64 || output_type == DT_INT32); auto* op = new ArgMaxOperator; @@ -1772,7 +1783,7 @@ void ConvertStackOperator(const NodeDef& node, op->inputs.push_back(node.input(i)); } // Both "Stack" and "Pack" have the "axis" attribute. - op->axis = GetIntAttr(node, "axis"); + op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0; op->outputs.push_back(node.name()); model->operators.emplace_back(op); } diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD index e0191801a0f0076565c51085ec293524d63cbe88..e1025c66642d2860c5916bf7625f1c0403c9901c 100644 --- a/tensorflow/contrib/lite/toco/tflite/BUILD +++ b/tensorflow/contrib/lite/toco/tflite/BUILD @@ -54,6 +54,7 @@ cc_library( "types.h", ], deps = [ + "//tensorflow/contrib/lite:string_util", "//tensorflow/contrib/lite/schema:schema_fbs", "//tensorflow/contrib/lite/toco:model", ], diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc index 0afd2f3df57caf3214dd198bfa2ee75fa7a8fd7b..c9c2e9ba0184ef3f531f325091afaf6976e07f4f 100644 --- a/tensorflow/contrib/lite/toco/tflite/types.cc +++ b/tensorflow/contrib/lite/toco/tflite/types.cc @@ -13,12 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/contrib/lite/toco/tflite/types.h" +#include "tensorflow/contrib/lite/string_util.h" namespace toco { namespace tflite { namespace { + +DataBuffer::FlatBufferOffset CopyStringToBuffer( + const Array& array, flatbuffers::FlatBufferBuilder* builder) { + const auto& src_data = array.GetBuffer().data; + ::tflite::DynamicBuffer dyn_buffer; + for (const string& str : src_data) { + dyn_buffer.AddString(str.c_str(), str.length()); + } + char* tensor_buffer; + int bytes = dyn_buffer.WriteToBuffer(&tensor_buffer); + std::vector dst_data(bytes); + memcpy(dst_data.data(), tensor_buffer, bytes); + free(tensor_buffer); + return builder->CreateVector(dst_data.data(), bytes); +} + template DataBuffer::FlatBufferOffset CopyBuffer( const Array& array, flatbuffers::FlatBufferBuilder* builder) { @@ -29,6 +46,18 @@ DataBuffer::FlatBufferOffset CopyBuffer( return builder->CreateVector(dst_data, size); } +void CopyStringFromBuffer(const ::tflite::Buffer& buffer, Array* array) { + auto* src_data = reinterpret_cast(buffer.data()->data()); + std::vector* dst_data = + &array->GetMutableBuffer().data; + int32_t num_strings = ::tflite::GetStringCount(src_data); + for (int i = 0; i < num_strings; i++) { + ::tflite::StringRef str_ref = ::tflite::GetString(src_data, i); + string this_str(str_ref.str, str_ref.len); + dst_data->push_back(this_str); + } +} + template void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) { using NativeT = ::toco::DataType; @@ -93,7 +122,7 @@ flatbuffers::Offset> DataBuffer::Serialize( case ArrayDataType::kInt64: return CopyBuffer(array, builder); case ArrayDataType::kString: - return CopyBuffer(array, builder); + return CopyStringToBuffer(array, builder); case ArrayDataType::kUint8: return CopyBuffer(array, builder); default: @@ -114,7 +143,7 @@ void DataBuffer::Deserialize(const ::tflite::Tensor& tensor, case ::tflite::TensorType_INT64: return CopyBuffer(buffer, array); case ::tflite::TensorType_STRING: - return CopyBuffer(buffer, array); + return CopyStringFromBuffer(buffer, array); case ::tflite::TensorType_UINT8: return CopyBuffer(buffer, array); default: diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc index a040fe135841b92a6e668f32cc5e36cf812ab15b..29fb0b2af22ef13e735f7c8128a2b9f6dbc42d19 100644 --- a/tensorflow/contrib/lite/toco/tflite/types_test.cc +++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc @@ -151,6 +151,13 @@ TEST(DataBuffer, Int32) { ::testing::ElementsAre(1, 1 << 30)); } +TEST(DataBuffer, String) { + Array recovered = ToFlatBufferAndBack( + {"AA", "BBB", "Best. String. Ever."}); + EXPECT_THAT(recovered.GetBuffer().data, + ::testing::ElementsAre("AA", "BBB", "Best. String. Ever.")); +} + TEST(Padding, All) { EXPECT_EQ(::tflite::Padding_SAME, Padding::Serialize(PaddingType::kSame)); EXPECT_EQ(PaddingType::kSame, Padding::Deserialize(::tflite::Padding_SAME)); diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc index cf2cbeedc77c382b84f80f11a371f300ea4282ae..5a341294db5e5ce4e1774f8b772df9e77f3b61e8 100644 --- a/tensorflow/contrib/lite/toco/tooling_util.cc +++ b/tensorflow/contrib/lite/toco/tooling_util.cc @@ -1405,20 +1405,7 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) { } input_minmax.min = (qmin - mean_value) / std_value; input_minmax.max = (qmax - mean_value) / std_value; - if (input_array.minmax) { - if (input_array_proto.has_mean_value() || - input_array_proto.has_std_value()) { - const double width = input_minmax.max - input_minmax.min; - const double kMinMaxAllowedDiff = 1e-6 * width; - CHECK(std::abs(input_minmax.min - input_array.minmax->min) < - kMinMaxAllowedDiff && - std::abs(input_minmax.max - input_array.minmax->max) < - kMinMaxAllowedDiff) - << input_minmax.min << ", " << input_minmax.max - << " != " << input_array.minmax->min << ", " - << input_array.minmax->max; - } - } else { + if (!input_array.minmax) { input_array.GetOrCreateMinMax() = input_minmax; } } diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py index f681b7b132750ef80aa56f25143418fbc4eaa1bb..5d4682ec9f4b8c5864383bd1d2f4c0b41a11baad 100644 --- a/tensorflow/contrib/lookup/lookup_ops_test.py +++ b/tensorflow/contrib/lookup/lookup_ops_test.py @@ -58,6 +58,12 @@ class HashTableOpTest(test.TestCase): result = output.eval() self.assertAllEqual([0, 1, -1], result) + exported_keys_tensor, exported_values_tensor = table.export() + + self.assertItemsEqual([b"brain", b"salad", b"surgery"], + exported_keys_tensor.eval()) + self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval()) + def testHashTableFindHighRank(self): with self.test_session(): default_val = -1 diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index 05e8d9064bea748c935859f5f9b4c7e646f504cf..1a1ab54a53dd5866ca8357067846c002c5d5e9c1 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -89,6 +89,7 @@ HOST_INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(HOST_GENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include @@ -125,7 +126,9 @@ PROTO_TEXT := $(HOST_BINDIR)proto_text # The list of dependencies is derived from the Bazel build file by running # the gen_file_lists.sh script on a system with a working Bazel setup. PROTO_TEXT_CC_FILES := $(shell cat $(MAKEFILE_DIR)/proto_text_cc_files.txt) -PROTO_TEXT_PB_CC_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) +PROTO_TEXT_PB_CC_LIST := \ + $(shell cat $(MAKEFILE_DIR)/proto_text_pb_cc_files.txt) \ + $(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) PROTO_TEXT_PB_H_LIST := $(shell cat $(MAKEFILE_DIR)/proto_text_pb_h_files.txt) # Locations of the intermediate files proto_text generates. @@ -171,6 +174,7 @@ INCLUDES := \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) ifeq ($(HAS_GEN_HOST_PROTOC),true) @@ -326,6 +330,7 @@ $(MARCH_OPTION) \ -I$(MAKEFILE_DIR)/downloads/gemmlowp \ -I$(MAKEFILE_DIR)/downloads/nsync/public \ -I$(MAKEFILE_DIR)/downloads/fft2d \ +-I$(MAKEFILE_DIR)/downloads/double_conversion \ -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \ -I$(PROTOGENDIR) \ -I$(PBTGENDIR) @@ -603,6 +608,7 @@ $(wildcard tensorflow/core/platform/*/*.cc) \ $(wildcard tensorflow/core/platform/*/*/*.cc) \ $(wildcard tensorflow/core/util/*.cc) \ $(wildcard tensorflow/core/util/*/*.cc) \ +$(wildcard tensorflow/contrib/makefile/downloads/double_conversion/double-conversion/*.cc) \ tensorflow/core/util/version_info.cc # Remove duplicates (for version_info.cc) CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS)) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 4d3de36e2a4141bed33d60f9e2e3c520aec3de67..eff9081e35c285027c764c5bdbaf14f78bc5f512 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -34,7 +34,8 @@ GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.g NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" -FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" +FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" +DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL_FILE_PATH}" | head -n1)" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" @@ -89,6 +90,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync" download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf" download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2" download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d" +download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion" download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl" download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive" diff --git a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py index 4fe4e8d044bd0b0987c0221ab225f449a71ccfc7..c35e60a5547c23e5f9c7b7fc2a0702d8a7decf30 100644 --- a/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py +++ b/tensorflow/contrib/meta_graph_transform/meta_graph_transform.py @@ -13,7 +13,10 @@ # limitations under the License. # ============================================================================== -"""Apply graph_transforms tool to MetaGraphDefs.""" +"""Apply graph_transforms tool to MetaGraphDefs. + +@@meta_graph_transform +""" from __future__ import absolute_import from __future__ import division @@ -30,7 +33,7 @@ from tensorflow.python.framework import importer as _importer from tensorflow.python.framework import ops as _ops from tensorflow.python.saved_model import constants as _saved_model_constants from tensorflow.python.training import saver as _saver_lib -from tensorflow.python.util import compat +from tensorflow.python.util import compat as _compat from tensorflow.tools import graph_transforms as _graph_transforms @@ -161,7 +164,7 @@ def _clean_save_and_restore(graph_def, op, removed_op_names): shapes = [] dtypes = [] for index, value in enumerate(name_op_value_tensor.string_val): - if not _is_removed(compat.as_str(value), removed_op_names): + if not _is_removed(_compat.as_str(value), removed_op_names): names.append(value) shapes.append(shape_op_value_tensor.string_val[index]) dtypes.append(op.attr['dtypes'].list.type[index]) @@ -651,7 +654,7 @@ def _is_removed_mentioned(s, removed_op_names): # /foo/bar. This regex ensures that we handle these two nodes # as separate entities. It matches on nodes having names in the form of # '/foo/bar_x' as well as nodes having names in the form of 'foo.' - s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', compat.as_str_any(s)) + s_names = _re.findall(r'((?:[\/]?[a-zA-Z0-9\_]*)*)', _compat.as_str_any(s)) for removed_op_name in removed_op_names: for s_name in s_names: if s_name.endswith(removed_op_name): @@ -737,9 +740,9 @@ def meta_graph_transform( for tag in tags: meta_graph_def.meta_info_def.tags.append(tag) - base_op_names = [compat.as_str(node.name) + base_op_names = [_compat.as_str(node.name) for node in base_meta_graph_def.graph_def.node] - retained_op_names = [compat.as_str(node.name) + retained_op_names = [_compat.as_str(node.name) for node in meta_graph_def.graph_def.node] removed_op_names = set(base_op_names) - set(retained_op_names) diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD index 5ca42f41c1c5055bf1917ad175b7b30666b18d4b..e050f3c8d4fc61adfdd3d15869e533ed2b51c4a8 100644 --- a/tensorflow/contrib/metrics/BUILD +++ b/tensorflow/contrib/metrics/BUILD @@ -77,7 +77,7 @@ py_test( py_test( name = "metric_ops_test", srcs = ["python/ops/metric_ops_test.py"], - shard_count = 3, + shard_count = 8, srcs_version = "PY2AND3", tags = ["noasan"], # times out b/63678675 deps = [ diff --git a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc index 8dca90a1e34d6a234c2b1479ca5594e88afcc194..ed22ee667f1d73b3f86f77e09bad9bfec7e46391 100644 --- a/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc +++ b/tensorflow/contrib/mpi_collectives/kernels/mpi_ops.cc @@ -73,7 +73,7 @@ limitations under the License. */ template -using StatusOr = perftools::gputools::port::StatusOr; +using StatusOr = se::port::StatusOr; using CPUDevice = Eigen::ThreadPoolDevice; using GPUDevice = Eigen::GpuDevice; diff --git a/tensorflow/contrib/mpi_collectives/mpi_ops.cc b/tensorflow/contrib/mpi_collectives/mpi_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..475297ca92111c6ead01b41d402556094dab1ee0 --- /dev/null +++ b/tensorflow/contrib/mpi_collectives/mpi_ops.cc @@ -0,0 +1,1236 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef TENSORFLOW_USE_MPI + +#include +#include +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/mutex.h" + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#include +#include "tensorflow/stream_executor/stream.h" +#endif + +#include "tensorflow/stream_executor/lib/statusor.h" + +#define OMPI_SKIP_MPICXX +#include "third_party/mpi/mpi.h" +#include "tensorflow/contrib/mpi_collectives/mpi_message.pb.h" +#include "tensorflow/contrib/mpi_collectives/ring.h" + +/* + * MPI Allreduce and Allgather Ops for TensorFlow. + * + * TensorFlow natively provides inter-device communication through send and + * receive ops and inter-node communication through Distributed TensorFlow, + * based on the same send and receive abstractions. These end up being + * insufficient for synchronous data-parallel training on HPC clusters where + * Infiniband or other high-speed interconnects are available. This module + * implements MPI ops for allgather and allreduce, which do bandwidth-optimal + * gathers and reductions and can take advantage of hardware-optimized + * communication libraries through the MPI implementation. + * + * The primary logic of the allreduce and allgather are in RingAllgather() and + * RingAllreduce(). The background thread which facilitates MPI operations is + * run in BackgroundThreadLoop(). The provided MPI ops are: + * – MPIInit: + * Initialize MPI on a given device (CPU or GPU). + * Should only be run on a single device in every process. + * – MPISize: + * Get the number of MPI processes in the global communicator. + * – MPIRank: + * Get the rank of the current MPI process in the global communicator. + * – MPILocalRank: + * Get the local rank of the current MPI process within its node. + * – MPIAllreduce: + * Perform an allreduce on a Tensor, returning the sum + * across all MPI processes in the global communicator. + * – MPIAllgather: + * Perform an allgather on a Tensor, returning the concatenation of + * the tensor on the first dimension across all MPI processes in the + * global communicator. + * + */ + +template +using StatusOr = se::port::StatusOr; + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +namespace tensorflow { +namespace contrib { +namespace mpi { + +// Make sure template specializations are generated in the ring.cu.cc and the +// ring.cc file, not in this file. +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, + Tensor*, Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllgather(OpKernelContext*, + const Tensor*, + const std::vector&, + Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, + Tensor*, Tensor*); +extern template Status RingAllreduce(OpKernelContext*, + const Tensor*, Tensor*, + Tensor*); +extern template Status RingAllgather(OpKernelContext*, + const Tensor*, + const std::vector&, + Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); +extern template Status RingAllgather( + OpKernelContext*, const Tensor*, const std::vector&, Tensor*); + +namespace { + +// Return true if the templated type is GPUDevice, otherwise false. +template +bool IsGPUDevice(); +template <> +bool IsGPUDevice() { + return true; +}; +template <> +bool IsGPUDevice() { + return false; +}; + +// A callback to call after the MPI communication completes. Since the +// allreduce and allgather ops are asynchronous, this callback is what resumes +// computation after the reduction is completed. +typedef std::function)> CommunicationDoneCallback; + +struct CollectiveOpRecord { + // The rank performing this piece of the op + int rank; + + // The name of the op/tensor to be reduced + std::string name; + + // The op's kernel context + OpKernelContext* context; + + // Data type of the op + DataType dtype; + + // The input tensor + const Tensor* in_t; + + // Allgather: Vector of per-rank first-dimension sizes + std::vector sizes_vec; + + // The temp tensor for intermediate results + Tensor temp_t; + + // The output tensor + Tensor* out_t; + + // Whether to run this op on the gpu + bool on_gpu; + + // The callback to call after the op has completed + CommunicationDoneCallback callback; +}; + +// Table storing Tensors to be reduced, keyed by unique name. +// This table contains everything necessary to do the reduction +typedef std::unordered_map TensorTable; + +// Table for storing Tensor metadata on rank zero. This is used for error +// checking and size calculations, as well as determining when a reduction is +// ready to be done (when all nodes are ready to do it). +typedef std::unordered_map > MessageTable; + +// The global state required for the MPI ops. +// +// MPI is a library that stores a lot of global per-program state and often +// requires running on a single thread. As a result, we have to have a single +// background thread responsible for all MPI operations, and communicate with +// that background thread through global state. +struct MPIGlobalState { + // An atomic boolean which is set to true when MPI is initialized. + // This ensures that MPI_Init is never called twice. + std::atomic_flag initialized_flag = ATOMIC_FLAG_INIT; + + // Condition variable to wait for initialization + condition_variable cv; + + // Whether MPI_Init has been completed on the background thread. + bool initialization_done = false; + + // Whether MPI_Init succeeded on the background thread. + Status init_status; + + // A mutex that needs to be used whenever MPI operations touch + // shared structures. + mutex mu; + + // Tensors waiting to be allreduced or allgathered. + TensorTable tensor_table; + + // Queue of MPI requests waiting to be sent to the coordinator node. + std::queue message_queue; + + // Background thread running MPI communication. + std::thread background_thread; + + // Whether the background thread should shutdown. + bool shut_down = false; + + // Only exists on the coordinator node (rank zero). Maintains a count of + // how many nodes are ready to allreduce every tensor (keyed by tensor + // name). + std::unique_ptr message_table; + + // The MPI rank, local rank, and size. + int rank = 0; + int local_rank = 0; + int size = 1; + + // The device that MPI was initialized on. (-1 for no GPU) + int device = -1; + + // The CUDA stream used for data transfers and within-allreduce operations. + // A naive implementation would use the TensorFlow StreamExecutor CUDA + // stream. However, the allreduce and allgather require doing memory copies + // and kernel executions (for accumulation of values on the GPU). However, + // the subsequent operations must wait for those operations to complete, + // otherwise MPI (which uses its own stream internally) will begin the data + // transfers before the CUDA calls are complete. In order to wait for those + // CUDA operations, if we were using the TensorFlow stream, we would have + // to synchronize that stream; however, other TensorFlow threads may be + // submitting more work to that stream, so synchronizing on it can cause + // the allreduce to be delayed, waiting for compute totally unrelated to it + // in other parts of the graph. Overlaying memory transfers and compute + // during backpropagation is crucial for good performance, so we cannot use + // the TensorFlow stream, and must use our own stream. +#if GOOGLE_CUDA + cudaStream_t stream; + std::atomic_flag stream_created_flag = ATOMIC_FLAG_INIT; +#endif + + ~MPIGlobalState() { + // Make sure that the destructor of the background thread is safe to + // call. If a thread is still joinable (not detached or complete) its + // destructor cannot be called. + if (background_thread.joinable()) { + shut_down = true; + background_thread.join(); + } + } +}; + +// All the MPI state that must be stored globally per-process. +static MPIGlobalState mpi_global; + +// For clarify in argument lists. +#define RANK_ZERO 0 + +// A tag used for all coordinator messaging. +#define TAG_NOTIFY 1 + +// Store the MPIRequest for a name, and return whether the total count of +// MPIRequests for that tensor is now equal to the MPI size (and thus we are +// ready to reduce the tensor). +bool IncrementTensorCount(std::unique_ptr& message_table, + MPIRequest msg, int mpi_size) { + auto name = msg.tensor_name(); + auto table_iter = message_table->find(name); + if (table_iter == message_table->end()) { + message_table->emplace(name, std::vector({msg})); + table_iter = message_table->find(name); + } else { + table_iter->second.push_back(msg); + } + + int count = table_iter->second.size(); + return count == mpi_size; +} + +// Once a tensor is ready to be reduced, the coordinator sends an MPIResponse +// instructing all ranks to start the reduction to all ranks. The MPIResponse +// also contains error messages in case the submitted MPIRequests were not +// valid (for example, contained mismatched shapes or types). +// +// Constructing the MPIResponse, thus, requires a whole lot of error checking. +MPIResponse ConstructMPIResponse(std::unique_ptr& message_table, + std::string name) { + bool error = false; + auto it = message_table->find(name); + assert(it != message_table->end()); + + std::vector requests = it->second; + assert(requests.size() > 0); + + std::ostringstream error_message_stream; + + // Check that all data types being reduced or gathered are identical + auto data_type = requests[0].tensor_type(); + for (unsigned int i = 1; i < requests.size(); i++) { + auto request_type = requests[i].tensor_type(); + if (data_type != request_type) { + error = true; + error_message_stream << "Mismatched data types: One rank had type " + << DataType_Name(data_type) + << ", but another rank had type " + << DataType_Name(request_type) << "."; + break; + } + } + + // Check that all requested operations are the same + auto message_type = requests[0].request_type(); + for (unsigned int i = 1; i < requests.size(); i++) { + if (error) { + break; + } + + auto request_type = requests[i].request_type(); + if (message_type != request_type) { + error = true; + error_message_stream << "Mismatched MPI operations: One rank did an " + << message_type << ", but another rank did an " + << request_type << "."; + break; + } + } + + // If we are doing an allreduce, check that all tensor shapes + // are identical + if (message_type == MPIRequest::ALLREDUCE) { + TensorShape tensor_shape = requests[0].tensor_shape(); + for (unsigned int i = 1; i < requests.size(); i++) { + if (error) { + break; + } + + TensorShape request_shape = requests[i].tensor_shape(); + if (tensor_shape != request_shape) { + error = true; + error_message_stream << "Mismatched allreduce tensor shapes: " + << "One rank reduced a tensor of shape " + << tensor_shape.DebugString() + << ", but another rank sent a tensor of shape " + << request_shape.DebugString() << "."; + break; + } + } + } + + // If we are doing an allgather, make sure all but the first dimension are + // the same. The first dimension may be different and the output tensor is + // the sum of the first dimension. Collect the sizes by rank. + if (message_type == MPIRequest::ALLGATHER) { + TensorShape tensor_shape = requests[0].tensor_shape(); + + if (tensor_shape.dims() == 0) { + error = true; + error_message_stream << "Rank zero tried to gather a rank-zero tensor."; + } + + for (unsigned int i = 1; i < requests.size(); i++) { + if (error) { + break; + } + + TensorShape request_shape = requests[i].tensor_shape(); + if (tensor_shape.dims() != request_shape.dims()) { + error = true; + error_message_stream << "Mismatched allgather tensor shapes: " + << "One rank gathered a tensor of rank " + << tensor_shape.dims() + << ", but another rank sent a tensor of rank " + << request_shape.dims() << "."; + break; + } + + for (unsigned int dim = 1; dim < tensor_shape.dims(); dim++) { + if (tensor_shape.dim_size(dim) != request_shape.dim_size(dim)) { + error = true; + error_message_stream + << "Mismatched allgather tensor shapes: " + << "One rank gathered a tensor with dimension " << dim + << " equal to " << tensor_shape.dim_size(dim) + << ", but another rank sent a tensor with dimension " << dim + << " equal to " << request_shape.dim_size(dim) << "."; + break; + } + } + } + } + + MPIResponse response; + response.set_tensor_name(name); + if (error) { + std::string error_message = error_message_stream.str(); + response.set_response_type(MPIResponse::ERROR); + response.set_error_message(error_message); + } else { + auto response_type = MPIResponse::ERROR; + if (message_type == MPIRequest::ALLREDUCE) { + response_type = MPIResponse::ALLREDUCE; + } else { + response_type = MPIResponse::ALLGATHER; + } + response.set_response_type(response_type); + } + + // Clear all queued up requests for this name. They are now taken care of + // by the constructed MPI response. + message_table->erase(it); + + return response; +} + +// Process an MPIResponse by doing a reduction, a gather, or raising an error. +void PerformCollectiveOp(TensorTable& tensor_table, MPIResponse response) { + OpKernelContext* context; + const Tensor* input_tensor; + std::vector sizes_vec; + Tensor temp_tensor; + Tensor* output_tensor; + CommunicationDoneCallback callback; + bool on_gpu; + { + // Lock on the tensor table. + mutex_lock guard(mpi_global.mu); + + // We should never fail at finding this key in the tensor table. + auto name = response.tensor_name(); + auto iter = tensor_table.find(name); + assert(iter != tensor_table.end()); + + assert(response.response_type() == MPIResponse::ALLREDUCE || + response.response_type() == MPIResponse::ALLGATHER || + response.response_type() == MPIResponse::ERROR); + + CollectiveOpRecord record = iter->second; + context = record.context; + input_tensor = record.in_t; + sizes_vec = record.sizes_vec; + temp_tensor = record.temp_t; + output_tensor = record.out_t; + on_gpu = record.on_gpu; + callback = record.callback; + + // Clear the tensor table of this tensor and its callbacks; the rest of + // this function takes care of it. + tensor_table.erase(iter); + } + + // Use CPUDevice instead of GPUDevice if no CUDA, to ensure we don't + // link to non-existent symbols. +#if GOOGLE_CUDA +#define GPU_DEVICE_IF_CUDA GPUDevice +#else +#define GPU_DEVICE_IF_CUDA CPUDevice +#endif + + Status status; + auto dtype = input_tensor->dtype(); + if (response.response_type() == MPIResponse::ALLGATHER) { + if (dtype == DT_FLOAT) { + status = on_gpu ? RingAllgather( + context, input_tensor, sizes_vec, output_tensor) + : RingAllgather( + context, input_tensor, sizes_vec, output_tensor); + } else if (dtype == DT_INT32) { + status = on_gpu ? RingAllgather( + context, input_tensor, sizes_vec, output_tensor) + : RingAllgather(context, input_tensor, + sizes_vec, output_tensor); + } else if (dtype == DT_INT64) { + status = on_gpu ? RingAllgather( + context, input_tensor, sizes_vec, output_tensor) + : RingAllgather( + context, input_tensor, sizes_vec, output_tensor); + } else { + status = errors::Unknown("Invalid tensor type for MPI allgather."); + } + } else if (response.response_type() == MPIResponse::ALLREDUCE) { + if (dtype == DT_FLOAT) { + status = on_gpu ? RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor) + : RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor); + } else if (dtype == DT_INT32) { + status = on_gpu ? RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor) + : RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor); + } else if (dtype == DT_INT64) { + status = on_gpu ? RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor) + : RingAllreduce( + context, input_tensor, &temp_tensor, output_tensor); + } else { + status = errors::Unknown("Invalid tensor type for MPI allreduce."); + } + } else if (response.response_type() == MPIResponse::ERROR) { + status = errors::FailedPrecondition(response.error_message()); + } + + if (status.ok()) { + callback(StatusOr(*output_tensor)); + } else { + callback(StatusOr(status)); + } +} + +// The MPI background thread loop coordinates all the MPI processes and the +// tensor reductions. The design of the communicator mechanism is limited by a +// few considerations: +// +// 1. Some MPI implementations require all MPI calls to happen from a +// single thread. Since TensorFlow may use several threads for graph +// processing, this means we must have our own dedicated thread for +// dealing with MPI. +// 2. We want to gracefully handle errors, when MPI processes do not +// properly agree upon what should happen (such as mismatched types or +// shapes). To do so requires the MPI processes to know about the shapes +// and types of the relevant tensors on the other processes. +// 3. The MPI reductions and gathers should be able to happen in parallel +// with other ongoing operations. Since MPI uses an internal +// (inaccessible) GPU stream separate from the TF GPUDevice streams, we +// cannot explicitly synchronize memcpys or kernels with it. As a result, +// MPIAllreduce and MPIAllgather must be AsyncOpKernels to ensure proper +// ordering of memcpys and kernels with respect to TF streams. +// 4. NOTE: We cannot guarantee that all the MPI processes reduce their +// tensors in the same order. Thus, there must be a way to ensure the +// reduction memcpys and kernels occur for correct tensors across all +// ranks at the same time. We choose to use a coordinator (rank ID 0) to +// gather and trigger the reduction operations that are ready to execute. +// +// The coordinator currently follows a master-worker paradigm. Rank zero acts +// as the master (the "coordinator"), whereas all other ranks are simply +// workers. Each rank runs its own background thread which progresses in ticks. +// In each tick, the following actions happen: +// +// a) The workers send any available MPIRequests to the coordinator. These +// MPIRequests indicate what the worker would like to do (i.e. which +// tensor they would like to gather or reduce, as well as their shape and +// type). They repeat this for every tensor that they would like to +// operate on after that tensor's collective op has executed ComputeAsync. +// +// b) The workers send an empty "DONE" message to the coordinator to +// indicate that there are no more tensors they wish to operate on. +// +// c) The coordinator receives the MPIRequests from the workers, as well +// as from its own TensorFlow ops, and stores them in a request table. The +// coordinator continues to receive MPIRequest messages until it has +// received MPI_SIZE number of empty "DONE" messages. +// +// d) The coordinator finds all tensors that are ready to be reduced, +// gathered, or all operations that result in an error. For each of those, +// it sends an MPIResponse to all the workers. When no more MPIResponses +// are available, it sends a "DONE" response to the workers. If the +// process is being shutdown, it instead sends a "SHUTDOWN" response. +// +// e) The workers listen for MPIResponse messages, processing each one by +// doing the required reduce or gather, until they receive a "DONE" +// response from the coordinator. At that point, the tick ends. +// If instead of "DONE" they receive "SHUTDOWN", they exit their +// background loop. +// TODO: Use the global mpi_global state variable instead of a local one +void BackgroundThreadLoop() { +#if GOOGLE_CUDA + // Set the device, so that this thread uses the same GPU context as the + // calling thread. + // TODO: Ensure that this is operating correctly. The background thread + // needs to be able to control all GPUs that the rank has access to, and + // might be more than 1 GPU. Tensors could be resident in any of the + // GPUs, so the background thread's accumulate and copy kernels might need + // to correctly set the device and it might be necessary for the background + // thread to manage multiple streams. + cudaSetDevice(mpi_global.device); + cudaStreamCreate(&mpi_global.stream); +#endif + + // Initialize MPI. This must happen on the background thread, since not all + // MPI implementations support being called from multiple threads. + auto init_result = MPI_Init(NULL, NULL); + if (init_result != MPI_SUCCESS) { + mpi_global.init_status = + errors::Unknown("Could not initialize MPI; MPI_Init() failed."); + mpi_global.initialization_done = true; + mpi_global.cv.notify_all(); + return; + } else { + mpi_global.init_status = Status::OK(); + } + + // Get MPI rank to determine if we are rank zero. + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + bool is_coordinator = rank == 0; + + // Get MPI size to determine how many tensors to wait for before reducing. + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // Determine local rank by querying the local communicator. + MPI_Comm local_comm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, + &local_comm); + int local_rank; + MPI_Comm_rank(local_comm, &local_rank); + + mpi_global.rank = rank; + mpi_global.local_rank = local_rank; + mpi_global.size = size; + mpi_global.initialization_done = true; + + // Notify calling thread that initialization is complete + mpi_global.cv.notify_all(); + + // TODO: MOVE MESSAGE TABLE INITIALIZATION TO LIBRARY LOAD! + // Initialize the tensor count table. No tensors are available yet. + if (is_coordinator) { + mpi_global.message_table = + std::unique_ptr(new MessageTable()); + } + + // The coordinator sends a SHUTDOWN message to trigger shutdown. + bool should_shut_down = false; + do { + // TODO: Eliminate the need for thread sleep by making all activity + // depend on other activity (e.g. condition or MPI waits). + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + + // Copy the data structures from global state under this lock. + // However, don't keep the lock for the rest of the loop, so that + // enqueued stream callbacks can continue. + std::queue message_queue; + { + mutex_lock guard(mpi_global.mu); + while (!mpi_global.message_queue.empty()) { + MPIRequest message = mpi_global.message_queue.front(); + mpi_global.message_queue.pop(); + message_queue.push(message); + } + } + + // Collect all tensors that are ready to be reduced. Record them in the + // tensor count table (rank zero) or send them to rank zero to be + // recorded (everyone else). + std::vector ready_to_reduce; + while (!message_queue.empty()) { + // Pop the first available message message + MPIRequest message = message_queue.front(); + message_queue.pop(); + + if (is_coordinator) { + bool reduce = + IncrementTensorCount(mpi_global.message_table, message, size); + if (reduce) { + ready_to_reduce.push_back(message.tensor_name()); + } + } else { + std::string encoded_message; + message.SerializeToString(&encoded_message); + MPI_Send(encoded_message.c_str(), encoded_message.length() + 1, + MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); + } + } + + // Rank zero has put all its own tensors in the tensor count table. + // Now, it should count all the tensors that are coming from other + // ranks at this tick. It should keep getting tensors until it gets a + // DONE message from all the other ranks. + if (is_coordinator) { + // Count of DONE messages. Keep receiving messages until the number + // of messages is equal to the number of processes. Initialize to + // one since the coordinator is effectively done. + int completed_ranks = 1; + while (completed_ranks != size) { + MPI_Status status; + MPI_Probe(MPI_ANY_SOURCE, TAG_NOTIFY, MPI_COMM_WORLD, &status); + + // Find number of characters in message (including zero byte). + int source_rank = status.MPI_SOURCE; + int msg_length; + MPI_Get_count(&status, MPI_BYTE, &msg_length); + + // If the length is zero, this is a DONE message. + if (msg_length == 0) { + completed_ranks++; + MPI_Recv(NULL, 0, MPI_BYTE, source_rank, TAG_NOTIFY, MPI_COMM_WORLD, + &status); + continue; + } + + // Get tensor name from MPI into an std::string. + char* buffer = new char[msg_length]; + MPI_Recv(buffer, msg_length, MPI_BYTE, source_rank, TAG_NOTIFY, + MPI_COMM_WORLD, &status); + std::string received_data(buffer); + delete[] buffer; + + MPIRequest received_message; + received_message.ParseFromString(received_data); + auto received_name = received_message.tensor_name(); + + bool reduce = IncrementTensorCount(mpi_global.message_table, + received_message, size); + if (reduce) { + ready_to_reduce.push_back(received_name); + } + } + + // At this point, rank zero should have a fully updated tensor + // count table and should know all the tensors that need to be + // reduced or gathered, and everyone else should have sent all + // their information to rank zero. We can now do reductions and + // gathers; rank zero will choose which ones and in what order, + // and will notify the other ranks before doing each reduction. + for (int i = 0; i < ready_to_reduce.size(); i++) { + // Notify all nodes which tensor we'd like to reduce now + auto name = ready_to_reduce[i]; + MPIResponse response = + ConstructMPIResponse(mpi_global.message_table, name); + + std::string encoded_response; + response.SerializeToString(&encoded_response); + for (int r = 1; r < size; r++) { + MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, + MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); + } + + // Perform the reduction. All nodes should end up performing + // the same reduction. + PerformCollectiveOp(mpi_global.tensor_table, response); + } + + // Notify all nodes that we are done with the reductions for this + // tick. + MPIResponse done_response; + should_shut_down = mpi_global.shut_down; + done_response.set_response_type( + mpi_global.shut_down ? MPIResponse::SHUTDOWN : MPIResponse::DONE); + std::string encoded_response; + done_response.SerializeToString(&encoded_response); + for (int r = 1; r < size; r++) { + MPI_Send(encoded_response.c_str(), encoded_response.length() + 1, + MPI_BYTE, r, TAG_NOTIFY, MPI_COMM_WORLD); + } + } else { + // Notify the coordinator that this node is done sending messages. + // A DONE message is encoded as a zero-length message. + MPI_Send(NULL, 0, MPI_BYTE, RANK_ZERO, TAG_NOTIFY, MPI_COMM_WORLD); + + // Receive names for tensors to reduce from rank zero. Once we + // receive a empty DONE message, stop waiting for more names. + while (true) { + MPI_Status status; + MPI_Probe(0, TAG_NOTIFY, MPI_COMM_WORLD, &status); + + // Find number of characters in message (including zero byte). + int msg_length; + MPI_Get_count(&status, MPI_BYTE, &msg_length); + + // Get tensor name from MPI into an std::string. + char* buffer = new char[msg_length]; + MPI_Recv(buffer, msg_length, MPI_BYTE, 0, TAG_NOTIFY, MPI_COMM_WORLD, + &status); + std::string received_message(buffer); + delete[] buffer; + + MPIResponse response; + response.ParseFromString(received_message); + if (response.response_type() == MPIResponse::DONE) { + // No more messages this tick + break; + } else if (response.response_type() == MPIResponse::SHUTDOWN) { + // No more messages this tick, and the background thread + // should shut down + should_shut_down = true; + break; + } else { + // Process the current message + PerformCollectiveOp(mpi_global.tensor_table, response); + } + } + } + } while (!should_shut_down); + + MPI_Finalize(); +} + +// Initialize MPI and start the MPI background thread. Ensure that this is +// only done once no matter how many times this function is called. +Status InitializeMPIOnce(bool gpu) { + // Ensure MPI is only initialized once. + if (mpi_global.initialized_flag.test_and_set()) return mpi_global.init_status; + + mpi_global.device = -1; +#if GOOGLE_CUDA + if (gpu) { + cudaGetDevice(&mpi_global.device); + } +#endif + + // Start the MPI background thread, which assumes MPI is initialized + // TODO: Change this to a Tensorflow thread + mpi_global.background_thread = std::thread(BackgroundThreadLoop); + + // Wait to ensure that the background thread has finished initializing MPI + mutex_lock guard(mpi_global.mu); + mpi_global.cv.wait(guard); + if (!mpi_global.initialization_done) { + mpi_global.init_status = + errors::Unknown("Failed to wait for MPI initialization."); + } + + return mpi_global.init_status; +} + +// Check that MPI is initialized. +Status IsMPIInitialized() { + if (!mpi_global.initialization_done) { + return errors::FailedPrecondition( + "MPI has not been initialized; use tf.contrib.mpi.Session."); + } + return Status::OK(); +} + +// This function (called from the callback set up in MPIAll*Op::ComputeAsync) +// only adds the op's record into the local op queue (to track the op's +// progress), and sends a message to the coordinator indicating that this rank +// is ready to begin. The MPI background thread will handle the MPI message. +void EnqueueTensorCollective(CollectiveOpRecord record, + MPIRequest::RequestType rtype) { + const Tensor* input_tensor = record.in_t; + MPIRequest message; + message.set_request_rank(record.rank); + message.set_tensor_name(record.name); + message.set_tensor_type(record.dtype); + message.set_request_type(rtype); + input_tensor->shape().AsProto(message.mutable_tensor_shape()); + + mutex_lock guard(mpi_global.mu); + mpi_global.tensor_table.emplace(record.name, record); + mpi_global.message_queue.push(message); +} + +} // namespace + +#if GOOGLE_CUDA +cudaStream_t CudaStreamForMPI() { return mpi_global.stream; } +#endif + +// Op to initialize MPI in the current process. The settings used in the +// configuration are the same that must be used for all future MPI ops. +template +class MPIInitOp : public OpKernel { + public: + explicit MPIInitOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + bool on_gpu = IsGPUDevice(); + OP_REQUIRES_OK(context, InitializeMPIOnce(on_gpu)); + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_CPU), + MPIInitOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPIInit").Device(DEVICE_GPU), + MPIInitOp); +#endif + +REGISTER_OP("MPIInit").Doc(R"doc( +Initialize MPI for the current process. + +If this is run on a GPU, then that GPU must be used for all future MPI +operations. If it is run on CPU, then all future MPI operations must also +run on CPU. +)doc"); + +// Op to get the current MPI Size. +template +class MPISizeOp : public OpKernel { + public: + explicit MPISizeOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK(context, IsMPIInitialized()); + + // Write integer to output tensor + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + + auto flat = output->flat(); + flat(0) = mpi_global.size; + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_CPU), + MPISizeOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPISize").Device(DEVICE_GPU).HostMemory("size"), + MPISizeOp); +#endif + +REGISTER_OP("MPISize") + .Output("size: int32") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Returns the number of running MPI processes. + +More precisely, returns the number of MPI processes in the group associated +with the MPI_COMM_WORLD communicator. + +size: Size of the MPI group. +)doc"); + +// Op to get the current MPI Rank. +template +class MPIRankOp : public OpKernel { + public: + explicit MPIRankOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK(context, IsMPIInitialized()); + + // Write integer to output tensor + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + + auto flat = output->flat(); + flat(0) = mpi_global.rank; + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_CPU), + MPIRankOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPIRank").Device(DEVICE_GPU).HostMemory("rank"), + MPIRankOp); +#endif + +REGISTER_OP("MPIRank") + .Output("rank: int32") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Returns the index of the current process in the MPI group. + +More precisely, returns the rank of the calling process in the MPI_COMM_WORLD +communicator. + +rank: Rank of the calling process. +)doc"); + +// Op to get the current local MPI Rank. +template +class MPILocalRankOp : public OpKernel { + public: + explicit MPILocalRankOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + OP_REQUIRES_OK(context, IsMPIInitialized()); + + // Write integer to output tensor + Tensor* output; + OP_REQUIRES_OK(context, + context->allocate_output(0, TensorShape({}), &output)); + + auto flat = output->flat(); + flat(0) = mpi_global.local_rank; + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPILocalRank").Device(DEVICE_CPU), + MPILocalRankOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER( + Name("MPILocalRank").Device(DEVICE_GPU).HostMemory("rank"), + MPILocalRankOp); +#endif + +REGISTER_OP("MPILocalRank") + .Output("rank: int32") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->Scalar()); + return Status::OK(); + }) + .Doc(R"doc( +Returns the index of the current process in the node it is on. + +More precisely, returns the rank of the calling process in communicator that +only spans the MPI processes running on that node. + +rank: Rank of the calling process on the node it is on. +)doc"); + +template +class MPIAllreduceOp : public AsyncOpKernel { + public: + explicit MPIAllreduceOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + // Although this op is handled asynchronously, the ComputeAsync call is + // very inexpensive. It only sets up a CollectiveOpRecord and places it + // in the table for the background thread to handle. Thus, we do not need + // a TF pool thread to perform the op. + bool IsExpensive() override { return false; } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); + const Tensor* input_tensor = &context->input(0); + Tensor* output_tensor; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output(0, input_tensor->shape(), &output_tensor), + done); + + // Record allocated on stack so op can fail without memory leak + CollectiveOpRecord record; + record.name = name(); + record.context = context; + record.in_t = input_tensor; + record.out_t = output_tensor; + record.on_gpu = IsGPUDevice(); + record.dtype = input_tensor->dtype(); + + const size_t temp_size = + (input_tensor->NumElements() + mpi_global.size - 1) / mpi_global.size; + TensorShape temp_shape; + temp_shape.AddDim(temp_size); + OP_REQUIRES_OK_ASYNC(context, + context->allocate_temp(input_tensor->dtype(), + temp_shape, &record.temp_t), + done); + + auto allreduce_done_callback = [done, context](StatusOr status) { + context->SetStatus(status.status()); + done(); + }; + record.callback = allreduce_done_callback; + + auto allreduce_launch_callback = [record] { + EnqueueTensorCollective(record, MPIRequest::ALLREDUCE); + }; + + // If we are on a CPU, our device context will be null and we can't + // get a stream to enqueue this on. On a CPU this op is called when the + // data is already available, so we can just immediately do the + // allreduce; we don't have to wait for the data to get populated. +#if GOOGLE_CUDA + auto device_context = context->op_device_context(); + if (device_context == nullptr) { + allreduce_launch_callback(); + } else { + auto stream = device_context->stream(); + stream->ThenDoHostCallback(allreduce_launch_callback); + } +#else + allreduce_launch_callback(); +#endif + } +}; + +REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_CPU), + MPIAllreduceOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER(Name("MPIAllreduce").Device(DEVICE_GPU), + MPIAllreduceOp); +#endif + +REGISTER_OP("MPIAllreduce") + .Attr("T: {int32, int64, float32}") + .Input("tensor: T") + .Output("sum: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c->set_output(0, c->input(0)); + return Status::OK(); + }) + .Doc(R"doc( +Perform an MPI Allreduce on a tensor. All other processes that do a reduction +on a tensor with the same name must have the same dimension for that tensor. +Tensors are reduced with other tensors that have the same node name for the +allreduce. + +Arguments + tensor: A tensor to reduce. + +Output + sum: A tensor with the same shape as `tensor`, summed across all + MPI processes. +)doc"); + +template +class MPIAllgatherOp : public AsyncOpKernel { + public: + explicit MPIAllgatherOp(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + // Although this op is handled asynchronously, the ComputeAsync call is + // very inexpensive. It only sets up a CollectiveOpRecord and places it + // in the table for the background thread to handle. Thus, we do not need + // a TF pool thread to perform the op. + bool IsExpensive() override { return false; } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + OP_REQUIRES_OK_ASYNC(context, IsMPIInitialized(), done); + const Tensor* input_tensor = &context->input(0); + const Tensor* sizing_tensor = &context->input(1); + + // Record allocated on stack so op can fail without memory leak + CollectiveOpRecord record; + record.name = name(); + record.context = context; + record.in_t = input_tensor; + record.on_gpu = IsGPUDevice(); + + // Construct the output size from the sizing tensor + size_t output_first_dim = 0; + if (sizing_tensor->shape().dims() == 0) { + // 0-dim sizing_tensor implies that the op is just gathering + // a single element from each rank + output_first_dim = mpi_global.size; + for (int i = 0; i < mpi_global.size; i++) { + record.sizes_vec.push_back(1); + } + } else { + // Collect the total output tensor sizing from the sizing tensor + // NOTE: The sizing tensor is forced to be placed on the CPU by + // declaring the input as HostMemory, so it is valid to read it here. + const int64* sizing_array = + (const int64*)sizing_tensor->tensor_data().data(); + for (int i = 0; i < mpi_global.size; i++) { + record.sizes_vec.push_back(sizing_array[i]); + output_first_dim += sizing_array[i]; + } + } + + TensorShape output_shape; + output_shape.AddDim(output_first_dim); + for (int i = 1; i < input_tensor->shape().dims(); i++) { + output_shape.AddDim(input_tensor->shape().dim_size(i)); + } + + Tensor* output_tensor; + OP_REQUIRES_OK_ASYNC( + context, context->allocate_output(0, output_shape, &output_tensor), + done); + + record.out_t = output_tensor; + record.dtype = input_tensor->dtype(); + + auto allgather_done_callback = [done, context](StatusOr status) { + context->SetStatus(status.status()); + done(); + }; + record.callback = allgather_done_callback; + + auto allgather_launch_callback = [record] { + EnqueueTensorCollective(record, MPIRequest::ALLGATHER); + }; + + // If we are on a CPU, our device context will be null and we can't + // get a stream to enqueue this on. On a CPU this op is called when the + // data is already available, so we can just immediately do the + // allgather; we don't have to wait for the data to get populated. +#if GOOGLE_CUDA + auto device_context = context->op_device_context(); + if (device_context == nullptr) { + allgather_launch_callback(); + } else { + auto stream = device_context->stream(); + stream->ThenDoHostCallback(allgather_launch_callback); + } +#else + allgather_launch_callback(); +#endif + } +}; + +REGISTER_OP("MPIAllgather") + .Attr("T: {int32, int64, float32}") + .Attr("S: {int64}") + .Input("tensor: T") + .Input("sizes: S") + .Output("gathered: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle output; + TF_RETURN_IF_ERROR( + c->ReplaceDim(c->input(0), 0, c->UnknownDim(), &output)); + c->set_output(0, output); + return Status::OK(); + }) + .Doc(R"doc( +Perform an MPI Allgather on a tensor. All other processes that do a gather on a +tensor with the same name must have the same rank for that tensor, and have the +same dimension on all but the first dimension. + +Arguments + tensor: A tensor to gather. + sizes: A tensor containing the first-dimension sizes of tensors to be + gathered from other ranks + +Output + gathered: A tensor with the same shape as `tensor` except for the first + dimension, which is the sum of dimensions in `sizes`. +)doc"); + +REGISTER_KERNEL_BUILDER( + Name("MPIAllgather").Device(DEVICE_CPU).HostMemory("sizes"), + MPIAllgatherOp); +#if GOOGLE_CUDA +REGISTER_KERNEL_BUILDER( + Name("MPIAllgather").Device(DEVICE_GPU).HostMemory("sizes"), + MPIAllgatherOp); +#endif + +} // namespace mpi +} // namespace contrib +} // namespace tensorflow + +#endif // TENSORFLOW_USE_MPI diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc index b9b482a6981e03144c6d00f2a38b71959b4b3621..b1cb89391ceaa70813be47cc1bba0c16f4f70e77 100644 --- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc +++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc @@ -24,7 +24,7 @@ limitations under the License. namespace tensorflow { -using ::perftools::gputools::cuda::ScopedActivateExecutorContext; +using se::cuda::ScopedActivateExecutorContext; // Contains data for a single stream used for nccl communication; this includes // a background thread that calls NcclManager::LoopKernelLaunches. @@ -37,11 +37,11 @@ struct NcclManager::NcclStream { cv.notify_all(); } - perftools::gputools::StreamExecutor* executor = nullptr; + se::StreamExecutor* executor = nullptr; // The stream on which to run the nccl collective. // This is a different stream than the tensorflow compute stream. - std::unique_ptr stream; + std::unique_ptr stream; // See NcclManager::LoopKernelLaunches for information on these. std::unique_ptr thread; @@ -95,9 +95,8 @@ ncclDataType_t ToNcclType(DataType t) { // A participant in a Collective. See below. struct NcclManager::Participant { Participant(const Tensor* in_t, Tensor* out_t, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - perftools::gputools::StreamExecutor* executor, int gpu_device_id, - NcclManager::DoneCallback done_callback) + se::Stream* tensor_stream, se::StreamExecutor* executor, + int gpu_device_id, NcclManager::DoneCallback done_callback) : in_t(in_t), out_t(out_t), event_mgr(event_mgr), @@ -121,11 +120,11 @@ struct NcclManager::Participant { EventMgr* const event_mgr; // Owned by the caller, who must keep it live until is called. - perftools::gputools::Stream* const tensor_stream; + se::Stream* const tensor_stream; // Matches the executor in CommunicatorMember::stream. Expected to be live for // process lifetime. - perftools::gputools::StreamExecutor* const executor = nullptr; + se::StreamExecutor* const executor = nullptr; const int gpu_device_id; @@ -245,7 +244,7 @@ NcclManager::Communicator* NcclManager::GetCommunicator( if (nccl_stream == nullptr) { nccl_stream = new NcclStream(); nccl_stream->executor = executor; - nccl_stream->stream.reset(new perftools::gputools::Stream(executor)); + nccl_stream->stream.reset(new se::Stream(executor)); nccl_stream->stream->Init(); streams.emplace_back(nccl_stream); @@ -300,10 +299,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator( void NcclManager::AddToAllReduce(int num_devices, const string& key, ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, Tensor* out_t, + se::Stream* tensor_stream, const Tensor* in_t, + Tensor* out_t, const DoneCallback& done_callback) { std::unique_ptr participant( new Participant(in_t, out_t, event_mgr, tensor_stream, executor, @@ -312,11 +311,12 @@ void NcclManager::AddToAllReduce(int num_devices, const string& key, kAllReduce, reduction_op); } -void NcclManager::AddBroadcastSend( - int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, int gpu_device_id, - EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, DoneCallback done_callback) { +void NcclManager::AddBroadcastSend(int num_devices, const string& key, + se::StreamExecutor* executor, + int gpu_device_id, EventMgr* event_mgr, + se::Stream* tensor_stream, + const Tensor* in_t, + DoneCallback done_callback) { std::unique_ptr participant( new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream, executor, gpu_device_id, std::move(done_callback))); @@ -325,11 +325,11 @@ void NcclManager::AddBroadcastSend( kBroadcast, ncclSum /* unused */); } -void NcclManager::AddBroadcastRecv( - int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, int gpu_device_id, - EventMgr* event_mgr, perftools::gputools::Stream* tensor_stream, - Tensor* out_t, DoneCallback done_callback) { +void NcclManager::AddBroadcastRecv(int num_devices, const string& key, + se::StreamExecutor* executor, + int gpu_device_id, EventMgr* event_mgr, + se::Stream* tensor_stream, Tensor* out_t, + DoneCallback done_callback) { std::unique_ptr participant( new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream, executor, gpu_device_id, std::move(done_callback))); @@ -339,9 +339,8 @@ void NcclManager::AddBroadcastRecv( void NcclManager::AddReduceSend(int num_devices, const string& key, ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, const Tensor* in_t, DoneCallback done_callback) { std::unique_ptr participant( @@ -353,9 +352,8 @@ void NcclManager::AddReduceSend(int num_devices, const string& key, void NcclManager::AddReduceRecv(int num_devices, const string& key, ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, const Tensor* in_t, Tensor* out_t, DoneCallback done_callback) { std::unique_ptr participant( @@ -444,7 +442,7 @@ void NcclManager::RunCollective(const string& key, Collective* collective) { } void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) { - perftools::gputools::Stream* comm_stream = nccl_stream->stream.get(); + se::Stream* comm_stream = nccl_stream->stream.get(); ScopedActivateExecutorContext scoped_context(nccl_stream->executor); const cudaStream_t* cu_stream = reinterpret_cast( comm_stream->implementation()->CudaStreamMemberHack()); diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h index 6ff8cea84eb912d5e5c891c40efc617661725a63..57a96c5d3342f6e934e88367881388fb160dc5e3 100644 --- a/tensorflow/contrib/nccl/kernels/nccl_manager.h +++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h @@ -55,41 +55,34 @@ class NcclManager { // is also the stream that will use the produced data; is // not called until the next kernel launched on would see the data. void AddToAllReduce(int num_devices, const string& key, - ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + ncclRedOp_t reduction_op, se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, Tensor* out_t, - const DoneCallback& done_callback); + se::Stream* tensor_stream, const Tensor* in_t, + Tensor* out_t, const DoneCallback& done_callback); // AddBroadcastSend and AddBroadcastRecv combine to sent data from one sender // to all receivers. void AddBroadcastSend(int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, const Tensor* in_t, DoneCallback done_callback); void AddBroadcastRecv(int num_devices, const string& key, - perftools::gputools::StreamExecutor* executor, - int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, + se::StreamExecutor* executor, int gpu_device_id, + EventMgr* event_mgr, se::Stream* tensor_stream, Tensor* out_t, DoneCallback done_callback); // AddReduceSend and AddReduceRecv combine to sent data from all senders // to one receiver. void AddReduceSend(int num_devices, const string& key, - ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + ncclRedOp_t reduction_op, se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, DoneCallback done_callback); + se::Stream* tensor_stream, const Tensor* in_t, + DoneCallback done_callback); void AddReduceRecv(int num_devices, const string& key, - ncclRedOp_t reduction_op, - perftools::gputools::StreamExecutor* executor, + ncclRedOp_t reduction_op, se::StreamExecutor* executor, int gpu_device_id, EventMgr* event_mgr, - perftools::gputools::Stream* tensor_stream, - const Tensor* in_t, Tensor* out_t, - DoneCallback done_callback); + se::Stream* tensor_stream, const Tensor* in_t, + Tensor* out_t, DoneCallback done_callback); private: enum CollectiveType { @@ -123,8 +116,7 @@ class NcclManager { // Maps a device to the communication streams that make up its collective. // This is used to share the stream across different communicators that // include the same device. - std::map>> + std::map>> device_to_comm_streams_ GUARDED_BY(mu_); std::vector> communicators_; diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc index 06ca65e33ad6f5fb6620144231dd368379dcc190..4d8d922cb42d2974dab32cf4562bee3993bef098 100644 --- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc +++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc @@ -175,11 +175,9 @@ class NcclManagerTest : public ::testing::Test { nullptr /* step_resource_manager */); } - static perftools::gputools::DeviceMemory AsDeviceMemory( - const Scalar* cuda_memory) { - perftools::gputools::DeviceMemoryBase wrapped( - const_cast(cuda_memory)); - perftools::gputools::DeviceMemory typed(wrapped); + static se::DeviceMemory AsDeviceMemory(const Scalar* cuda_memory) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + se::DeviceMemory typed(wrapped); return typed; } diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD index 612ecc3e63891f4dabf97828fe75672dd7877a91..13aa1d7e7a11877373a848c1ba865aa418790cd0 100644 --- a/tensorflow/contrib/opt/BUILD +++ b/tensorflow/contrib/opt/BUILD @@ -25,6 +25,7 @@ py_library( "python/training/multitask_optimizer_wrapper.py", "python/training/nadam_optimizer.py", "python/training/powersign.py", + "python/training/reg_adagrad_optimizer.py", "python/training/sign_decay.py", "python/training/variable_clipping_optimizer.py", ], @@ -155,6 +156,25 @@ py_test( ], ) +py_test( + name = "reg_adagrad_optimizer_test", + srcs = ["python/training/reg_adagrad_optimizer_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":opt_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:embedding_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//third_party/py/numpy", + ], +) + py_test( name = "nadam_optimizer_test", srcs = ["python/training/nadam_optimizer_test.py"], diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py index aeca900bc8ff4c4cc26da490ce43dfec70fd9f11..72117c1e81a164b0517fabeaddec3ea5132af5a9 100644 --- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py +++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer.py @@ -56,21 +56,21 @@ class LazyAdamOptimizer(adam.AdamOptimizer): epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) - # m := beta1 * m + (1 - beta1) * g_t + # \\(m := beta1 * m + (1 - beta1) * g_t\\) m = self.get_slot(var, "m") m_t = state_ops.scatter_update(m, grad.indices, beta1_t * array_ops.gather(m, grad.indices) + (1 - beta1_t) * grad.values, use_locking=self._use_locking) - # v := beta2 * v + (1 - beta2) * (g_t * g_t) + # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\) v = self.get_slot(var, "v") v_t = state_ops.scatter_update(v, grad.indices, beta2_t * array_ops.gather(v, grad.indices) + (1 - beta2_t) * math_ops.square(grad.values), use_locking=self._use_locking) - # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t)) + # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\) m_t_slice = array_ops.gather(m_t, grad.indices) v_t_slice = array_ops.gather(v_t, grad.indices) denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d0e0405a2c3e5ec05cf487a2ca48207b7a9d4663 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer.py @@ -0,0 +1,107 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""RegAdagrad for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.ops import math_ops +from tensorflow.python.training import adagrad +from tensorflow.python.training import training_ops +from tensorflow.python.util import tf_contextlib + + +class RegAdagradOptimizer(adagrad.AdagradOptimizer): + """RegAdagrad: Adagrad with updates that optionally skip updating the slots. + + This is meant to address the problem of additional regularization terms in the + loss function affecting learning rate decay and causing hyper-param + entanglement. Example usage: + + loss = tf.nn.cross_entropy(x, labels) + reg_loss = reg_strength * tf.reduce_sum(x * x) + opt = tf.contrib.opt.RegAdagradOptimizer(learning_rate) + loss_update = opt.minimize(loss) + with opt.avoid_updating_slots(): + reg_update = opt.minimize(reg_loss) + total_update = tf.group([loss_update, reg_update]) + + # ... + + sess.run(total_update, ...) + """ + + def __init__(self, + learning_rate, + initial_accumulator_value=0.1, + use_locking=False, + name="RegAdagrad"): + super(RegAdagradOptimizer, self).__init__( + learning_rate, + initial_accumulator_value=initial_accumulator_value, + use_locking=use_locking, + name=name) + self._should_update_slots = True + + @tf_contextlib.contextmanager + def avoid_updating_slots(self): + old = self._should_update_slots + self._should_update_slots = False + try: + yield + finally: + self._should_update_slots = old + + def _apply_dense(self, grad, var): + acc = self.get_slot(var, "accumulator") + return training_ops.apply_adagrad( + var, + acc, + math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), + grad, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _resource_apply_dense(self, grad, var, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.resource_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype), + grad, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _apply_sparse(self, grad, var, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.sparse_apply_adagrad( + var, + acc, + math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), + grad.values, + grad.indices, + use_locking=self._use_locking, + update_slots=self._should_update_slots) + + def _resource_apply_sparse(self, grad, var, indices, update_slots=True): + acc = self.get_slot(var, "accumulator") + return training_ops.resource_sparse_apply_adagrad( + var.handle, + acc.handle, + math_ops.cast(self._learning_rate_tensor, grad.dtype), + grad, + indices, + use_locking=self._use_locking, + update_slots=self._should_update_slots) diff --git a/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py new file mode 100644 index 0000000000000000000000000000000000000000..ea56e1646a0811ab065105cd260a760b5b718354 --- /dev/null +++ b/tensorflow/contrib/opt/python/training/reg_adagrad_optimizer_test.py @@ -0,0 +1,343 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for Regreg_adagrad_optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.opt.python.training import reg_adagrad_optimizer +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + + +class RegAdagradOptimizerTest(test.TestCase): + + def doTestBasic(self, use_locking=False, use_resource=False): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + if use_resource: + var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype) + var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype) + else: + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1, use_locking=use_locking) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testBasic(self): + self.doTestBasic(use_locking=False) + + def testBasicResource(self): + self.doTestBasic(use_locking=False, use_resource=True) + + def testBasicLocked(self): + self.doTestBasic(use_locking=True) + + def testMinimizeSparseResourceVariable(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = resource_variable_ops.ResourceVariable( + [[1.0, 2.0], [3.0, 4.0]], dtype=dtype) + x = constant_op.constant([[4.0], [5.0]], dtype=dtype) + pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x) + loss = pred * pred + sgd_op = reg_adagrad_optimizer.RegAdagradOptimizer(1.0).minimize(loss) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]], + var0.eval()) + # Run 1 step of sgd + sgd_op.run() + # Validate updated params + self.assertAllCloseAccordingToType( + [[0, 1], [3, 4]], var0.eval(), atol=0.01) + + def testTensorLearningRate(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + constant_op.constant(3.0), initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Run 3 steps of adagrad + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testSparseBasic(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([[1.0], [2.0]], dtype=dtype) + var1 = variables.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant([0.1], shape=[1, 1], dtype=dtype), + constant_op.constant([0]), constant_op.constant([2, 1])) + grads1 = ops.IndexedSlices( + constant_op.constant([0.01], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([[1.0], [2.0]], var0.eval()) + self.assertAllClose([[3.0], [4.0]], var1.eval()) + # Run 3 step of sgd + for _ in range(3): + ada_update.run() + # Validate updated params + self.assertAllCloseAccordingToType( + np.array([[-1.6026098728179932], [2.0]]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([[3.0], [3.715679168701172]]), var1.eval()) + + def testSparseRepeatedIndices(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + repeated_index_update_var = variables.Variable( + [[1.0], [2.0]], dtype=dtype) + aggregated_update_var = variables.Variable([[1.0], [2.0]], dtype=dtype) + grad_repeated_index = ops.IndexedSlices( + constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype), + constant_op.constant([1, 1]), constant_op.constant([2, 1])) + grad_aggregated = ops.IndexedSlices( + constant_op.constant([0.2], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + repeated_update = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0).apply_gradients([(grad_repeated_index, + repeated_index_update_var)]) + aggregated_update = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0).apply_gradients([(grad_aggregated, aggregated_update_var)]) + variables.global_variables_initializer().run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + for _ in range(3): + repeated_update.run() + aggregated_update.run() + self.assertAllClose(aggregated_update_var.eval(), + repeated_index_update_var.eval()) + + def testSparseRepeatedIndicesResourceVariable(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var_repeated = resource_variable_ops.ResourceVariable( + [1.0, 2.0], dtype=dtype) + loss_repeated = math_ops.reduce_sum( + embedding_ops.embedding_lookup(var_repeated, [0, 0])) + var_aggregated = resource_variable_ops.ResourceVariable( + [1.0, 2.0], dtype=dtype) + loss_aggregated = 2 * math_ops.reduce_sum( + embedding_ops.embedding_lookup(var_aggregated, [0])) + update_op_repeated = reg_adagrad_optimizer.RegAdagradOptimizer( + 2.0).minimize(loss_repeated) + update_op_aggregated = reg_adagrad_optimizer.RegAdagradOptimizer( + 2.0).minimize(loss_aggregated) + variables.global_variables_initializer().run() + self.assertAllCloseAccordingToType(var_repeated.eval(), + var_aggregated.eval()) + for _ in range(3): + update_op_repeated.run() + update_op_aggregated.run() + self.assertAllCloseAccordingToType(var_repeated.eval(), + var_aggregated.eval()) + + def testSparseStability(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + shape = [1, 6] + var0 = variables.Variable( + [[ + 0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257, + -0.0105945 + ]], + dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant( + [[ + -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, + -8.4877e-05, -9.48906e-05 + ]], + shape=shape, + dtype=dtype), constant_op.constant([0]), + constant_op.constant(shape)) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 1.0, initial_accumulator_value=0.1) + ada_update = ada_opt.apply_gradients(zip([grads0], [var0])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + init = variables.global_variables_initializer() + for _ in range(100): + init.run() + ada_update.run() + self.assertAllCloseAccordingToType( + np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([[ + 0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573, + -0.01029443 + ]]), var0.eval()) + + def testSharing(self): + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer(3.0) + # Apply the optimizer twice. Both applications will use + # the same accums. + ada_update1 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + ada_update2 = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + ada_update1.run() + ada_update2.run() + ada_update1.run() + # Validate updated params (the same as with only 1 RegAdagrad). + self.assertAllCloseAccordingToType( + np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval()) + self.assertAllCloseAccordingToType( + np.array([2.715679168701172, 3.715679168701172]), var1.eval()) + + def testDynamicShapeVariable_Ok(self): + with self.test_session(): + v = variable_scope.get_variable( + "v", initializer=constant_op.constant(1.), validate_shape=False) + self.assertFalse(v.shape.is_fully_defined()) + # Creating optimizer should cause no exception. + reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=0.1) + + def testSkipUpdatingSlots(self): + iav = 0.130005 # A value that works with float16 + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([1.0, 2.0], dtype=dtype) + var1 = variables.Variable([3.0, 4.0], dtype=dtype) + grads0 = constant_op.constant([0.1, 0.1], dtype=dtype) + grads1 = constant_op.constant([0.01, 0.01], dtype=dtype) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=iav) + # Apply the optimizer twice. Both applications will use + # the same accums. + with ada_opt.avoid_updating_slots(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + self.assertEqual(["accumulator"], ada_opt.get_slot_names()) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + variables.global_variables_initializer().run() + + # Fetch params to validate initial values. + self.assertAllClose([1.0, 2.0], var0.eval()) + self.assertAllClose([3.0, 4.0], var1.eval()) + # Mix the first and the second adagrad for 3 steps. + for _ in range(3): + ada_update.run() + # Validate that ada_opt's slots are not updated. + self.assertAllCloseAccordingToType(np.array([iav, iav]), slot0.eval()) + self.assertAllCloseAccordingToType(np.array([iav, iav]), slot1.eval()) + + def testSparseSkipUpdatingSlots(self): + iav = 0.130005 # A value that works with float16 + for dtype in [dtypes.half, dtypes.float32, dtypes.float64]: + with self.test_session(): + var0 = variables.Variable([[1.0], [2.0]], dtype=dtype) + var1 = variables.Variable([[3.0], [4.0]], dtype=dtype) + grads0 = ops.IndexedSlices( + constant_op.constant([0.1], shape=[1, 1], dtype=dtype), + constant_op.constant([0]), constant_op.constant([2, 1])) + grads1 = ops.IndexedSlices( + constant_op.constant([0.01], shape=[1, 1], dtype=dtype), + constant_op.constant([1]), constant_op.constant([2, 1])) + ada_opt = reg_adagrad_optimizer.RegAdagradOptimizer( + 3.0, initial_accumulator_value=iav) + with ada_opt.avoid_updating_slots(): + ada_update = ada_opt.apply_gradients( + zip([grads0, grads1], [var0, var1])) + slot0 = ada_opt.get_slot(var0, "accumulator") + self.assertEquals(slot0.get_shape(), var0.get_shape()) + slot1 = ada_opt.get_slot(var1, "accumulator") + self.assertEquals(slot1.get_shape(), var1.get_shape()) + + variables.global_variables_initializer().run() + # Fetch params to validate initial values + self.assertAllClose([[1.0], [2.0]], var0.eval()) + self.assertAllClose([[3.0], [4.0]], var1.eval()) + # Run 3 step of sgd + for _ in range(3): + ada_update.run() + # Validate that ada_opt's slots are not updated. + self.assertAllCloseAccordingToType( + np.array([[iav], [iav]]), slot0.eval()) + self.assertAllCloseAccordingToType( + np.array([[iav], [iav]]), slot1.eval()) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py index 42b7f92a76c1971e2a63722d769ee006c3f3210b..d538ad0fb02699ed8514f512208914f629a47436 100644 --- a/tensorflow/contrib/optimizer_v2/adam.py +++ b/tensorflow/contrib/optimizer_v2/adam.py @@ -40,23 +40,19 @@ class AdamOptimizer(optimizer_v2.OptimizerV2): Initialization: - ``` - m_0 <- 0 (Initialize initial 1st moment vector) - v_0 <- 0 (Initialize initial 2nd moment vector) - t <- 0 (Initialize timestep) - ``` + $$m_0 := 0 (Initialize initial 1st moment vector)$$ + $$v_0 := 0 (Initialize initial 2nd moment vector)$$ + $$t := 0 (Initialize timestep)$$ The update rule for `variable` with gradient `g` uses an optimization described at the end of section2 of the paper: - ``` - t <- t + 1 - lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) + $$t := t + 1$$ + $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$ - m_t <- beta1 * m_{t-1} + (1 - beta1) * g - v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g - variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon) - ``` + $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$ + $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$ + $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$ The default value of 1e-8 for epsilon might not be a good default in general. For example, when training an Inception network on ImageNet a diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py index 8ac9b581455f8f4c7af1a66432169ae179de1634..9e2858d00ff192e56680b288651975410c63c539 100644 --- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py +++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py @@ -702,8 +702,7 @@ class CheckpointCompatibilityTests(test.TestCase): with save_graph.as_default(), self.test_session( graph=save_graph) as session: root = self._initialized_model() - object_saver = checkpointable_utils.CheckpointableSaver(root) - save_path = object_saver.save( + save_path = root.save( session=session, file_prefix=checkpoint_prefix) with context.eager_mode(): root = self._initialized_model() @@ -716,8 +715,7 @@ class CheckpointCompatibilityTests(test.TestCase): checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") with context.eager_mode(): root = self._initialized_model() - object_saver = checkpointable_utils.CheckpointableSaver(root) - save_path = object_saver.save(file_prefix=checkpoint_prefix) + save_path = root.save(file_prefix=checkpoint_prefix) with context.graph_mode(): save_graph = ops.Graph() with save_graph.as_default(), self.test_session( diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py index ce15db6f1ec067e5aeb6ddbc8939d2b773692269..46bfbb729fa9cdfc98f4228f516a7c5c42f23059 100644 --- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py +++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py @@ -125,19 +125,6 @@ class _DenseResourceVariableProcessor(_OptimizableVariable): return update_op -class _StreamingModelPortProcessor(_OptimizableVariable): - """Processor for streaming ModelPorts.""" - - def __init__(self, v): - self._v = v - - def target(self): - return self._v - - def update_op(self, optimizer, g, *args): - return g - - class _TensorProcessor(_OptimizableVariable): """Processor for ordinary Tensors. @@ -167,8 +154,6 @@ def _get_processor(v): return _DenseResourceVariableProcessor(v) if isinstance(v, variables.Variable): return _RefVariableProcessor(v) - if v.op.type == "SubmodelPort": - return _StreamingModelPortProcessor(v) if isinstance(v, ops.Tensor): return _TensorProcessor(v) raise NotImplementedError("Trying to optimize unsupported type ", v) diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py index aa0ef643088ef36b84596d08f78c29594ceca2d6..6f41722748b4754a293f4998e925577a71bc4576 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py @@ -501,8 +501,27 @@ def _GetBatchNormParams(graph, context, has_scaling): bn_decay_var_tensor = None split_context = context.split('/') - base_context = split_context[-1] - + # Matching variable names is brittle and relies on scoping + # conventions. Fused batch norm folding is more robust. Support for unfused + # batch norms will be deprecated as we move forward. Fused batch norms allow + # for faster training and should be used whenever possible. + # context contains part of the names of the tensors we are interested in: + # For MobilenetV1, the context has repetitions: + # MobilenetV1/MobilenetV1/Conv2d_3_depthwise + # when the moving_mean tensor has the name: + # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read + # To pick the correct variable name, it is necessary to ignore the repeating + # header. + + # For MobilenetV2, this problem does not exist: + # The context is: MobilenetV2/expanded_conv_3/depthwise + # and the names of the tensors start with a single MobilenetV2 + # The moving mean for example, has the name: + # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read + # We ignore the first string (MobilenetV1 or MobilenetV2) + # in the context to match correctly in both cases + + base_context = '/'.join(split_context[1:]) oplist = graph.get_operations() op_suffix_mean = base_context + '/BatchNorm/moments/Squeeze' op_suffix_variance = base_context + '/BatchNorm/moments/Squeeze_1' @@ -520,7 +539,6 @@ def _GetBatchNormParams(graph, context, has_scaling): op_suffix_gamma = base_context + '/BatchNorm/gamma' op_suffix_moving_variance = base_context + '/BatchNorm/moving_variance/read' op_suffix_moving_mean = base_context + '/BatchNorm/moving_mean/read' - # Parse through list of ops to find relevant ops for op in oplist: if op.name.endswith(op_suffix_mean): diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py index af31467476b1536adef2bb74308fd1093f7bea7a..64e8142e7c60924e0e5f6136886943b518976c6f 100644 --- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py +++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py @@ -134,6 +134,85 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase): def testFoldConv2d(self): self._RunTestOverParameters(self._TestFoldConv2d) + def testMultipleLayerConv2d(self, + relu=nn_ops.relu, + relu_op_name='Relu', + has_scaling=True, + fused_batch_norm=False, + freeze_batch_norm_delay=None): + """Tests folding cases for a network with multiple layers. + + Args: + relu: Callable that returns an Operation, a factory method for the Relu*. + relu_op_name: String, name of the Relu* operation. + has_scaling: Bool, when true the batch norm has scaling. + fused_batch_norm: Bool, when true the batch norm is fused. + freeze_batch_norm_delay: None or the number of steps after which training + switches to using frozen mean and variance + """ + g = ops.Graph() + with g.as_default(): + batch_size, height, width = 5, 128, 128 + inputs = array_ops.zeros((batch_size, height, width, 3)) + out_depth = 3 + stride = 1 + activation_fn = relu + scope = 'network/expanded_conv_1/conv' + layer1 = conv2d( + inputs, + out_depth, [5, 5], + stride=stride, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=activation_fn, + normalizer_fn=batch_norm, + normalizer_params=self._BatchNormParams( + scale=has_scaling, fused=fused_batch_norm), + scope=scope) + # Add another layer + scope = 'network/expanded_conv_2/conv' + + _ = conv2d( + layer1, + 2 * out_depth, [5, 5], + stride=stride, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=activation_fn, + normalizer_fn=batch_norm, + normalizer_params=self._BatchNormParams( + scale=has_scaling, fused=fused_batch_norm), + scope=scope) + + fold_batch_norms.FoldBatchNorms( + g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay) + folded_mul = g.get_operation_by_name(scope + '/mul_fold') + self.assertEqual(folded_mul.type, 'Mul') + self._AssertInputOpsAre(folded_mul, [ + scope + '/correction_mult', + self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm) + ]) + self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold']) + + folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold') + self.assertEqual(folded_conv.type, 'Conv2D') + # Remove :0 at end of name for tensor prior to comparison + self._AssertInputOpsAre(folded_conv, + [scope + '/mul_fold', layer1.name[:-2]]) + self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul']) + + folded_add = g.get_operation_by_name(scope + '/add_fold') + self.assertEqual(folded_add.type, 'Add') + self._AssertInputOpsAre(folded_add, [ + scope + '/correction_add', + self._BathNormBiasName(scope, fused_batch_norm) + ]) + output_op_names = [scope + '/' + relu_op_name] + self._AssertOutputGoesToOps(folded_add, g, output_op_names) + + for op in g.get_operations(): + self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name) + def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm, freeze_batch_norm_delay): diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index d2d0426d233aaadb4ffd0fb222c77ade0a98278c..efc1a94b3c6e34cfd9f45e57af0be3faf4490866 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -133,19 +133,27 @@ def Quantize(graph, bits=activation_bits, producer_scope=scope, consumer_scope=scope) - _InsertQuantOp( - add_context, - 'add_quant', - layer_match.bypass_op, - input_to_ops_map.ConsumerOperations(layer_match.bypass_op), - is_training, - moving_avg=True, - ema_decay=ema_decay, - quant_delay=quant_delay, - vars_collection=vars_collection, - bits=activation_bits, - producer_scope=scope, - consumer_scope=scope) + # Make sure the op following this isn't an activation. In which case, we + # shouldn't quantize it, since the activation will be Fused into the + # Add at inference time. + consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op) + if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]): + logging.info('Skipping %s, because its followed by an activation.', + layer_match.bypass_op.name) + else: + _InsertQuantOp( + add_context, + 'add_quant', + layer_match.bypass_op, + input_to_ops_map.ConsumerOperations(layer_match.bypass_op), + is_training, + moving_avg=True, + ema_decay=ema_decay, + quant_delay=quant_delay, + vars_collection=vars_collection, + bits=activation_bits, + producer_scope=scope, + consumer_scope=scope) # Quantize bypass ops that occur after the activation. if layer_match.post_activation_bypass_op is not None: @@ -153,19 +161,27 @@ def Quantize(graph, r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1) # If `scope` is given, only quantize it if the producer is in the right # scope. - _InsertQuantOp( - post_activation_bypass_context, - 'post_activation_bypass_quant', - layer_match.post_activation_bypass_op, - input_to_ops_map.ConsumerOperations( - layer_match.post_activation_bypass_op), - is_training, - moving_avg=True, - ema_decay=ema_decay, - quant_delay=quant_delay, - vars_collection=vars_collection, - bits=activation_bits, - producer_scope=scope) + # Make sure the op following this isn't an activation. In which case, we + # shouldn't quantize it, since the activation will be Fused into the + # Add at inference time. + consumers = input_to_ops_map.ConsumerOperations( + layer_match.post_activation_bypass_op) + if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]): + logging.info('Skipping %s, because its followed by an activation.', + layer_match.post_activation_bypass_op.name) + else: + _InsertQuantOp( + post_activation_bypass_context, + 'post_activation_bypass_quant', + layer_match.post_activation_bypass_op, + consumers, + is_training, + moving_avg=True, + ema_decay=ema_decay, + quant_delay=quant_delay, + vars_collection=vars_collection, + bits=activation_bits, + producer_scope=scope) def _FindLayersToQuantize(graph): diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py index caf8ff28d50d2880d491d04c1ed368597519dcd7..54faf582f15a26c12813f3fdffe2dda6aa5cc91f 100644 --- a/tensorflow/contrib/quantize/python/quantize_graph_test.py +++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py @@ -113,20 +113,6 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase): # Ensure that variables were added. self.assertTrue(len(orig_variable_names) < len(q_variables)) - def testWithPreActivationBypass(self): - self._RunTestOverAllRewrites(self._TestWithPreActivationBypass) - - def _TestWithPreActivationBypass(self, rewrite_fn): - # Tests that the default graph is correctly used when no args are provided - # to rewrite_fn. - with ops.Graph().as_default() as g: - self._ConvLayer(pre_activation_bypass=True, scope='scope1') - rewrite_fn() - - op_names = [op.name for op in g.get_operations()] - self.assertTrue( - any('scope1/add_quant/' in name for name in op_names)) - def testWithPostActivationBypass(self): self._RunTestOverAllRewrites(self._TestWithPostActivationBypass) diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py index d37c83d6839f02c52a72cac97c9238c135dc2f66..5e479f394680420e510550a510d46d8fff2c03b0 100644 --- a/tensorflow/contrib/quantize/python/quantize_test.py +++ b/tensorflow/contrib/quantize/python/quantize_test.py @@ -82,9 +82,22 @@ class QuantizeTest(test_util.TensorFlowTestCase): quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) quantization_node_name = 'FakeQuantWithMinMaxVars' - add_quant = graph.get_operation_by_name('test/add_quant/' + - quantization_node_name) - self.assertEqual(add_quant.type, quantization_node_name) + conv_quant = graph.get_operation_by_name('test/test/conv_quant/' + + quantization_node_name) + self.assertEqual(conv_quant.type, quantization_node_name) + + # Scan through all FakeQuant operations, ensuring that the activation + # isn't in the consumers of the operation. Since activations are folded + # the preceding operation during inference, the FakeQuant operation after + # the activation is all that is needed. + for op in graph.get_operations(): + if op.type == quantization_node_name: + quant_op = graph.get_operation_by_name(op.name) + consumers = [] + for output in quant_op.outputs: + consumers.extend(output.consumers()) + + self.assertNotIn('test/identity', [c.name for c in consumers]) def testInsertQuantOpForAddAfterSeparableConv2d(self): self._RunTestOverParameters( @@ -109,9 +122,20 @@ class QuantizeTest(test_util.TensorFlowTestCase): quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) quantization_node_name = 'FakeQuantWithMinMaxVars' - add_quant = graph.get_operation_by_name('test/add_quant/' + - quantization_node_name) - self.assertEqual(add_quant.type, quantization_node_name) + conv_quant = graph.get_operation_by_name('test/test/conv_quant/' + + quantization_node_name) + self.assertEqual(conv_quant.type, quantization_node_name) + + for op in graph.get_operations(): + if op.type == quantization_node_name: + quant_op = graph.get_operation_by_name(op.name) + # Scan through all FakeQuant operations, ensuring that the activation + # identity op isn't in the consumers of the operation. + consumers = [] + for output in quant_op.outputs: + consumers.extend(output.consumers()) + + self.assertNotIn('test/identity', [c.name for c in consumers]) def testFinalLayerQuantized(self): self._RunTestOverParameters(self._TestFinalLayerQuantized) @@ -153,12 +177,21 @@ class QuantizeTest(test_util.TensorFlowTestCase): activation_fn=array_ops.identity, scope='test/test') bypass_tensor = math_ops.add(conv, input2, name='test/add') - _ = array_ops.identity(bypass_tensor, name='test/output') + # The output of the post_activation bypass will be another layer. + _ = conv2d( + bypass_tensor, + 32, [5, 5], + stride=2, + padding='SAME', + weights_initializer=self._WeightInit(0.09), + activation_fn=array_ops.identity, + scope='test/unused') quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) - # Ensure that the bypass node is preceded and followed by - # FakeQuantWithMinMaxVars operations. + # Ensure that the bypass node is preceded by and followed by a + # FakeQuantWithMinMaxVar operation, since the output of the Add isn't an + # activation. self.assertTrue('FakeQuantWithMinMaxVars' in [c.type for c in bypass_tensor.consumers()]) self.assertTrue('FakeQuantWithMinMaxVars' in @@ -198,9 +231,9 @@ class QuantizeTest(test_util.TensorFlowTestCase): quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8) - # Ensure that the bypass node is preceded and followed by - # FakeQuantWithMinMaxVars operations. - self.assertTrue('FakeQuantWithMinMaxVars' in + # Ensure that the bypass node is preceded by a FakeQuantWithMinMaxVar + # operation, and NOT followed by one. + self.assertTrue('FakeQuantWithMinMaxVars' not in [c.type for c in bypass_tensor.consumers()]) self.assertTrue('FakeQuantWithMinMaxVars' in [i.op.type for i in bypass_tensor.op.inputs]) diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc index 03006dab323a7c6dc83d9a17c035ef705f7b0366..45d22b739b8c597c7ebda85968aa44cd599a798c 100644 --- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc +++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc @@ -26,9 +26,9 @@ namespace tensorflow { #if GOOGLE_CUDA namespace { template -perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { - perftools::gputools::DeviceMemoryBase wrapped(const_cast(cuda_memory)); - perftools::gputools::DeviceMemory typed(wrapped); +se::DeviceMemory AsDeviceMemory(const T* cuda_memory) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + se::DeviceMemory typed(wrapped); return typed; } } // namespace @@ -41,9 +41,8 @@ void TensorCuBlasGemm::operator()(OpKernelContext* ctx, bool transa, T alpha, const T* a, int lda, const T* b, int ldb, T beta, T* c, int ldc) { #if GOOGLE_CUDA - perftools::gputools::blas::Transpose trans[] = { - perftools::gputools::blas::Transpose::kNoTranspose, - perftools::gputools::blas::Transpose::kTranspose}; + se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, + se::blas::Transpose::kTranspose}; auto a_ptr = AsDeviceMemory(a); auto b_ptr = AsDeviceMemory(b); diff --git a/tensorflow/contrib/rpc/python/kernel_tests/BUILD b/tensorflow/contrib/rpc/python/kernel_tests/BUILD index f3e6731213fd0aa8f14845a7b1562001b134c7ac..2311c15a68c46090cec0f97bd950296506b0817e 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/BUILD +++ b/tensorflow/contrib/rpc/python/kernel_tests/BUILD @@ -28,7 +28,6 @@ py_library( py_library( name = "rpc_op_test_base", srcs = ["rpc_op_test_base.py"], - tags = ["notsan"], deps = [ ":test_example_proto_py", "//tensorflow/contrib/proto", diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py index e2e0dbc7a22efb2d8bf1e03e325eb8b6a4734993..3fc6bfbb4d03a39906d4441e48b2788423caa234 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py +++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py @@ -35,6 +35,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase): _protocol = 'grpc' invalid_method_string = 'Method not found' + connect_failed_string = 'Connect Failed' def __init__(self, methodName='runTest'): # pylint: disable=invalid-name super(RpcOpTest, self).__init__(methodName) diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py index 89f3ee1a1c52e4e292ef850ed375ad49b79fa1f5..27273d16b1c09eba60e124e632b353b09ea2d063 100644 --- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py +++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py @@ -93,40 +93,39 @@ class RpcOpTestBase(object): response_values = sess.run(response_tensors) self.assertAllEqual(response_values.shape, [0]) - def testInvalidAddresses(self): - with self.test_session() as sess: - with self.assertRaisesOpError(self.invalid_method_string): - sess.run( - self.rpc( - method='/InvalidService.IncrementTestShapes', - address=self._address, - request='')) + def testInvalidMethod(self): + for method in [ + '/InvalidService.IncrementTestShapes', + self.get_method_name('InvalidMethodName') + ]: + with self.test_session() as sess: + with self.assertRaisesOpError(self.invalid_method_string): + sess.run(self.rpc(method=method, address=self._address, request='')) - with self.assertRaisesOpError(self.invalid_method_string): - sess.run( - self.rpc( - method=self.get_method_name('InvalidMethodName'), - address=self._address, - request='')) + _, status_code_value, status_message_value = sess.run( + self.try_rpc(method=method, address=self._address, request='')) + self.assertEqual(errors.UNIMPLEMENTED, status_code_value) + self.assertTrue( + self.invalid_method_string in status_message_value.decode('ascii')) - # This also covers the case of address='' - # and address='localhost:293874293874' + def testInvalidAddress(self): + # This covers the case of address='' and address='localhost:293874293874' + address = 'unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@' + with self.test_session() as sess: with self.assertRaises(errors.UnavailableError): sess.run( self.rpc( method=self.get_method_name('IncrementTestShapes'), - address='unix:/tmp/this_unix_socket_doesnt_exist_97820348!!@', + address=address, request='')) - - # Test invalid method with the TryRpc op _, status_code_value, status_message_value = sess.run( self.try_rpc( - method=self.get_method_name('InvalidMethodName'), - address=self._address, + method=self.get_method_name('IncrementTestShapes'), + address=address, request='')) - self.assertEqual(errors.UNIMPLEMENTED, status_code_value) + self.assertEqual(errors.UNAVAILABLE, status_code_value) self.assertTrue( - self.invalid_method_string in status_message_value.decode('ascii')) + self.connect_failed_string in status_message_value.decode('ascii')) def testAlwaysFailingMethod(self): with self.test_session() as sess: @@ -138,6 +137,18 @@ class RpcOpTestBase(object): with self.assertRaisesOpError(I_WARNED_YOU): sess.run(response_tensors) + response_tensors, status_code, status_message = self.try_rpc( + method=self.get_method_name('AlwaysFailWithInvalidArgument'), + address=self._address, + request='') + self.assertEqual(response_tensors.shape, ()) + self.assertEqual(status_code.shape, ()) + self.assertEqual(status_message.shape, ()) + status_code_value, status_message_value = sess.run((status_code, + status_message)) + self.assertEqual(errors.INVALID_ARGUMENT, status_code_value) + self.assertTrue(I_WARNED_YOU in status_message_value.decode('ascii')) + def testSometimesFailingMethodWithManyRequests(self): with self.test_session() as sess: # Fail hard by default. @@ -197,8 +208,7 @@ class RpcOpTestBase(object): address=self._address, request=request_tensors) for _ in range(10) ] - # Launch parallel 10 calls to the RpcOp, each containing - # 20 rpc requests. + # Launch parallel 10 calls to the RpcOp, each containing 20 rpc requests. many_response_values = sess.run(many_response_tensors) self.assertEqual(10, len(many_response_values)) for response_values in many_response_values: diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py index 90033015ebc5e44ea70fbf2bc9735d0aeb4ec27d..e893e1d1c836cc7feef15757dde79d0db362cbaf 100644 --- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py +++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py @@ -37,7 +37,7 @@ def _top_k_generator(k): def _top_k(probabilities, targets): targets = math_ops.to_int32(targets) if targets.get_shape().ndims > 1: - targets = array_ops.squeeze(targets, squeeze_dims=[1]) + targets = array_ops.squeeze(targets, axis=[1]) return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k)) return _top_k @@ -57,7 +57,7 @@ def _r2(probabilities, targets, weights=None): def _squeeze_and_onehot(targets, depth): - targets = array_ops.squeeze(targets, squeeze_dims=[1]) + targets = array_ops.squeeze(targets, axis=[1]) return array_ops.one_hot(math_ops.to_int32(targets), depth) diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py index ff3ab21eaa9a4aa823f2ae7d3dd39674abea3d2a..745a5b1caf2fe348f1b276ccc245aa2ef350a62e 100644 --- a/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py +++ b/tensorflow/contrib/tensor_forest/hybrid/python/layers/fully_connected.py @@ -55,7 +55,7 @@ class ManyToOneLayer(hybrid_layer.HybridLayer): # There is always one activation per instance by definition, so squeeze # away the extra dimension. - return array_ops.squeeze(nn_activations, squeeze_dims=[1]) + return array_ops.squeeze(nn_activations, axis=[1]) class FlattenedFullyConnectedLayer(hybrid_layer.HybridLayer): diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index b9bcbb170b04fe953be2d2dd515b607127d3cae6..7a35a70bbe3112e0649cefd8116cc50565978da5 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -445,7 +445,7 @@ class RandomForestGraphs(object): mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) gather_indices = array_ops.squeeze( - array_ops.where(mask), squeeze_dims=[1]) + array_ops.where(mask), axis=[1]) # TODO(thomaswc): Calculate out-of-bag data and labels, and store # them for use in calculating statistics later. tree_data = array_ops.gather(processed_dense_features, gather_indices) diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc index b412b296e02751427b80e7c1596f2530942519c6..07740277115fe4956f21e8db1dabfb10990a2095 100644 --- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc +++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc @@ -111,20 +111,22 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph, } } -std::pair ParseTensorName(string name, int default_idx = 0) { +std::pair ParseTensorName(const string& name, + int default_idx = 0) { + string name_no_idx = name; int idx = default_idx; - size_t sep = name.find_last_of(':'); + const size_t sep = name_no_idx.find_last_of(':'); if (sep != string::npos) { - name = name.substr(0, sep); + name_no_idx = name_no_idx.substr(0, sep); idx = std::stoi(name.substr(sep + 1)); } - return std::make_pair(name, idx); + return std::make_pair(name_no_idx, idx); } std::unordered_map> BuildTensorNameMap( const std::vector& tensor_names) { std::unordered_map> result; - for (string const& tensor_name : tensor_names) { + for (const string& tensor_name : tensor_names) { string node_name; int index; std::tie(node_name, index) = ParseTensorName(tensor_name); @@ -132,6 +134,7 @@ std::unordered_map> BuildTensorNameMap( } return result; } + // TODO(sami): convert references to pointers struct ConvertGraphParams { ConvertGraphParams( diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc index b32371b642f38b0851955a4a3beab97b86e1f6a0..53ba7badcaea15a049010049f427e1f4786df7ce 100644 --- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc @@ -25,7 +25,6 @@ limitations under the License. namespace tensorflow { static ::tensorflow::tensorrt::Logger logger; -namespace gpu = ::perftools::gputools; using IRuntime = nvinfer1::IRuntime; using Dims = nvinfer1::Dims; diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly.py b/tensorflow/contrib/timeseries/examples/known_anomaly.py index e77628ddd390374d6336e3583e07ce03cdec7aea..71621abc7190fae9973f78522e23f03d43e342c6 100644 --- a/tensorflow/contrib/timeseries/examples/known_anomaly.py +++ b/tensorflow/contrib/timeseries/examples/known_anomaly.py @@ -41,17 +41,8 @@ _MODULE_PATH = path.dirname(__file__) _DATA_FILE = path.join(_MODULE_PATH, "data/changepoints.csv") -def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300): - """Training, evaluating, and predicting on a series with changepoints.""" - - # Indicate the format of our exogenous feature, in this case a string - # representing a boolean value. - string_feature = tf.feature_column.categorical_column_with_vocabulary_list( - key="is_changepoint", vocabulary_list=["no", "yes"]) - # Specify the way this feature is presented to the model, here using a one-hot - # encoding. - one_hot_feature = tf.feature_column.indicator_column( - categorical_column=string_feature) +def state_space_esitmator(exogenous_feature_columns): + """Constructs a StructuralEnsembleRegressor.""" def _exogenous_update_condition(times, features): del times # unused @@ -62,14 +53,48 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300): # no changepoint. return tf.equal(tf.squeeze(features["is_changepoint"], axis=-1), "yes") - estimator = tf.contrib.timeseries.StructuralEnsembleRegressor( - periodicities=12, - # Extract a smooth period by constraining the number of latent values - # being cycled between. - cycle_num_latent_values=3, - num_features=1, - exogenous_feature_columns=[one_hot_feature], - exogenous_update_condition=_exogenous_update_condition) + return ( + tf.contrib.timeseries.StructuralEnsembleRegressor( + periodicities=12, + # Extract a smooth period by constraining the number of latent values + # being cycled between. + cycle_num_latent_values=3, + num_features=1, + exogenous_feature_columns=exogenous_feature_columns, + exogenous_update_condition=_exogenous_update_condition), + # Use truncated backpropagation with a window size of 64, batching + # together 4 of these windows (random offsets) per training step. Training + # with exogenous features often requires somewhat larger windows. + 4, 64) + + +def autoregressive_esitmator(exogenous_feature_columns): + input_window_size = 8 + output_window_size = 2 + return ( + tf.contrib.timeseries.ARRegressor( + periodicities=12, + num_features=1, + input_window_size=input_window_size, + output_window_size=output_window_size, + exogenous_feature_columns=exogenous_feature_columns), + 64, input_window_size + output_window_size) + + +def train_and_evaluate_exogenous( + estimator_fn, csv_file_name=_DATA_FILE, train_steps=300): + """Training, evaluating, and predicting on a series with changepoints.""" + # Indicate the format of our exogenous feature, in this case a string + # representing a boolean value. + string_feature = tf.feature_column.categorical_column_with_vocabulary_list( + key="is_changepoint", vocabulary_list=["no", "yes"]) + # Specify the way this feature is presented to the model, here using a one-hot + # encoding. + one_hot_feature = tf.feature_column.indicator_column( + categorical_column=string_feature) + + estimator, batch_size, window_size = estimator_fn( + exogenous_feature_columns=[one_hot_feature]) reader = tf.contrib.timeseries.CSVReader( csv_file_name, # Indicate the format of our CSV file. First we have two standard columns, @@ -85,10 +110,7 @@ def train_and_evaluate_exogenous(csv_file_name=_DATA_FILE, train_steps=300): # This CSV has a header line; here we just ignore it. skip_header_lines=1) train_input_fn = tf.contrib.timeseries.RandomWindowInputFn( - # Use truncated backpropagation with a window size of 64, batching - # together 4 of these windows (random offsets) per training step. Training - # with exogenous features often requires somewhat larger windows. - reader, batch_size=4, window_size=64) + reader, batch_size=batch_size, window_size=window_size) estimator.train(input_fn=train_input_fn, steps=train_steps) evaluation_input_fn = tf.contrib.timeseries.WholeDatasetInputFn(reader) evaluation = estimator.evaluate(input_fn=evaluation_input_fn, steps=1) @@ -145,7 +167,12 @@ def main(unused_argv): if not HAS_MATPLOTLIB: raise ImportError( "Please install matplotlib to generate a plot from this example.") - make_plot("Ignoring a known anomaly", *train_and_evaluate_exogenous()) + make_plot("Ignoring a known anomaly (state space)", + *train_and_evaluate_exogenous( + estimator_fn=state_space_esitmator)) + make_plot("Ignoring a known anomaly (autoregressive)", + *train_and_evaluate_exogenous( + estimator_fn=autoregressive_esitmator, train_steps=3000)) pyplot.show() diff --git a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py index c3e307cad815d3c9c8556d0349d366d6f938101a..8c64f2e186a1aab0235f7cfbf1a942b872edd93b 100644 --- a/tensorflow/contrib/timeseries/examples/known_anomaly_test.py +++ b/tensorflow/contrib/timeseries/examples/known_anomaly_test.py @@ -23,12 +23,24 @@ from tensorflow.contrib.timeseries.examples import known_anomaly from tensorflow.python.platform import test -class KnownAnaomalyExampleTest(test.TestCase): +class KnownAnomalyExampleTest(test.TestCase): - def test_shapes_and_variance_structural(self): + def test_shapes_and_variance_structural_ar(self): (times, observed, all_times, mean, upper_limit, lower_limit, anomaly_locations) = known_anomaly.train_and_evaluate_exogenous( - train_steps=50) + train_steps=1, estimator_fn=known_anomaly.autoregressive_esitmator) + self.assertAllEqual( + anomaly_locations, + [25, 50, 75, 100, 125, 150, 175, 249]) + self.assertAllEqual(all_times.shape, mean.shape) + self.assertAllEqual(all_times.shape, upper_limit.shape) + self.assertAllEqual(all_times.shape, lower_limit.shape) + self.assertAllEqual(times.shape, observed.shape) + + def test_shapes_and_variance_structural_ssm(self): + (times, observed, all_times, mean, upper_limit, lower_limit, + anomaly_locations) = known_anomaly.train_and_evaluate_exogenous( + train_steps=50, estimator_fn=known_anomaly.state_space_esitmator) self.assertAllEqual( anomaly_locations, [25, 50, 75, 100, 125, 150, 175, 249]) diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py index 4f6527a5465ca01ed34150a26ba26d73a858cd74..558d9480b495ca87e828e8b440370ec9c6e3be2f 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py +++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py @@ -60,7 +60,8 @@ class ARModel(model.TimeSeriesModel): num_features, num_time_buckets=10, loss=NORMAL_LIKELIHOOD_LOSS, - hidden_layer_sizes=None): + hidden_layer_sizes=None, + exogenous_feature_columns=None): """Constructs an auto-regressive model. Args: @@ -81,6 +82,11 @@ class ARModel(model.TimeSeriesModel): observations and predictions, while the training loss is computed on normalized data (if input statistics are available). hidden_layer_sizes: list of sizes of hidden layers. + exogenous_feature_columns: A list of `tf.feature_column`s (for example + `tf.feature_column.embedding_column`) corresponding to exogenous + features which provide extra information to the model but are not part + of the series to be predicted. Passed to + `tf.feature_column.input_layer`. """ self.input_window_size = input_window_size self.output_window_size = output_window_size @@ -90,7 +96,12 @@ class ARModel(model.TimeSeriesModel): self.window_size = self.input_window_size + self.output_window_size self.loss = loss super(ARModel, self).__init__( - num_features=num_features) + num_features=num_features, + exogenous_feature_columns=exogenous_feature_columns) + if exogenous_feature_columns is not None: + self.exogenous_size = self._get_exogenous_embedding_shape()[-1] + else: + self.exogenous_size = 0 assert num_time_buckets > 0 self._buckets = int(num_time_buckets) if periodicities is None or not periodicities: @@ -110,7 +121,10 @@ class ARModel(model.TimeSeriesModel): # that the serving input_receiver_fn gets placeholder shapes correct. return (array_ops.zeros([self.input_window_size], dtype=dtypes.int64), array_ops.zeros( - [self.input_window_size, self.num_features], dtype=self.dtype)) + [self.input_window_size, self.num_features], dtype=self.dtype), + array_ops.zeros( + [self.input_window_size, self.exogenous_size], + dtype=self.dtype)) # TODO(allenl,agarwal): Support sampling for AR. def random_model_parameters(self, seed=None): @@ -163,7 +177,7 @@ class ARModel(model.TimeSeriesModel): activations.append((activation, activation_size)) return activations - def prediction_ops(self, times, values): + def prediction_ops(self, times, values, exogenous_regressors): """Compute model predictions given input data. Args: @@ -173,6 +187,8 @@ class ARModel(model.TimeSeriesModel): prediction times. values: A [batch size, self.input_window_size, self.num_features] Tensor with input features. + exogenous_regressors: A [batch size, self.window_size, + self.exogenous_size] Tensor with exogenous features. Returns: Tuple (predicted_mean, predicted_covariance), where each element is a Tensor with shape [batch size, self.output_window_size, @@ -183,25 +199,33 @@ class ARModel(model.TimeSeriesModel): if self.input_window_size: values.get_shape().assert_is_compatible_with( [None, self.input_window_size, self.num_features]) + if exogenous_regressors is not None: + exogenous_regressors.get_shape().assert_is_compatible_with( + [None, self.window_size, self.exogenous_size]) # Create input features. + activation_components = [] if self._periods: _, time_features = self._compute_time_features(times) activation_size = self.window_size * self._buckets * len(self._periods) - activation = array_ops.reshape(time_features, [-1, activation_size]) + activation_components.append( + array_ops.reshape(time_features, [-1, activation_size])) else: activation_size = 0 - activation = None - if self.input_window_size: inp = array_ops.slice(values, [0, 0, 0], [-1, self.input_window_size, -1]) inp_size = self.input_window_size * self.num_features inp = array_ops.reshape(inp, [-1, inp_size]) - if activation is not None: - activation = array_ops.concat([inp, activation], 1) - else: - activation = inp + activation_components.append(inp) activation_size += inp_size + if self.exogenous_size: + exogenous_size = self.window_size * self.exogenous_size + activation_size += exogenous_size + exogenous_flattened = array_ops.reshape( + exogenous_regressors, [-1, exogenous_size]) + activation_components.append(exogenous_flattened) assert activation_size + assert activation_components + activation = array_ops.concat(activation_components, axis=1) activations.append((activation, activation_size)) # Create hidden layers. activations += self._create_hidden_stack(activation, activation_size) @@ -228,6 +252,19 @@ class ARModel(model.TimeSeriesModel): math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype) return loss_op + def _process_exogenous_features(self, times, features): + embedded = super(ARModel, self)._process_exogenous_features( + times=times, features=features) + if embedded is None: + assert self.exogenous_size == 0 + # No embeddings. Return a zero-size [batch, times, 0] array so we don't + # have to special case it downstream. + return array_ops.zeros( + array_ops.concat([array_ops.shape(times), constant_op.constant([0])], + axis=0)) + else: + return embedded + # TODO(allenl, agarwal): Consider better ways of warm-starting predictions. def predict(self, features): """Computes predictions multiple steps into the future. @@ -243,6 +280,7 @@ class ARModel(model.TimeSeriesModel): segment of the time series before `TIMES`. This data is used to start of the autoregressive computation. This should have data for at least self.input_window_size timesteps. + And any exogenous features, with shapes prefixed by shape of `TIMES`. Returns: A dictionary with keys, "mean", "covariance". The values are Tensors of shape [batch_size, predict window size, @@ -250,25 +288,39 @@ class ARModel(model.TimeSeriesModel): """ predict_times = math_ops.cast( ops.convert_to_tensor(features[PredictionFeatures.TIMES]), dtypes.int32) + exogenous_regressors = self._process_exogenous_features( + times=predict_times, + features={key: value for key, value in features.items() + if key not in [TrainEvalFeatures.TIMES, + TrainEvalFeatures.VALUES, + PredictionFeatures.STATE_TUPLE]}) + with ops.control_dependencies( + [check_ops.assert_equal(array_ops.shape(predict_times)[1], + array_ops.shape(exogenous_regressors)[1])]): + exogenous_regressors = array_ops.identity(exogenous_regressors) batch_size = array_ops.shape(predict_times)[0] num_predict_values = array_ops.shape(predict_times)[1] prediction_iterations = ((num_predict_values + self.output_window_size - 1) // self.output_window_size) - # Pad predict_times so as to have exact multiple of self.output_window_size - # values per example. + # Pad predict_times and exogenous regressors so as to have exact multiple of + # self.output_window_size values per example. padding_size = (prediction_iterations * self.output_window_size - num_predict_values) - padding = array_ops.zeros([batch_size, padding_size], predict_times.dtype) - predict_times = control_flow_ops.cond( - padding_size > 0, lambda: array_ops.concat([predict_times, padding], 1), - lambda: predict_times) + predict_times = array_ops.pad( + predict_times, [[0, 0], [0, padding_size]]) + exogenous_regressors = array_ops.pad( + exogenous_regressors, [[0, 0], [0, padding_size], [0, 0]]) state = features[PredictionFeatures.STATE_TUPLE] - (state_times, state_values) = state + (state_times, state_values, state_exogenous_regressors) = state state_times = math_ops.cast( ops.convert_to_tensor(state_times), dtypes.int32) state_values = ops.convert_to_tensor(state_values, dtype=self.dtype) + state_exogenous_regressors = ops.convert_to_tensor( + state_exogenous_regressors, dtype=self.dtype) initial_input_times = predict_times[:, :self.output_window_size] + initial_input_exogenous_regressors = ( + exogenous_regressors[:, :self.output_window_size, :]) if self.input_window_size > 0: initial_input_times = array_ops.concat( [state_times[:, -self.input_window_size:], initial_input_times], 1) @@ -279,6 +331,11 @@ class ARModel(model.TimeSeriesModel): check_ops.assert_equal(values_size, times_size) ]): initial_input_values = state_values[:, -self.input_window_size:, :] + initial_input_exogenous_regressors = array_ops.concat( + [state_exogenous_regressors[:, -self.input_window_size:, :], + initial_input_exogenous_regressors[ + :, :self.output_window_size, :]], + axis=1) else: initial_input_values = 0 @@ -288,9 +345,10 @@ class ARModel(model.TimeSeriesModel): return math_ops.less(iteration_number, prediction_iterations) def _while_body(iteration_number, input_times, input_values, - mean_ta, covariance_ta): + input_exogenous_regressors, mean_ta, covariance_ta): """Predict self.output_window_size values.""" - prediction_ops = self.prediction_ops(input_times, input_values) + prediction_ops = self.prediction_ops( + input_times, input_values, input_exogenous_regressors) predicted_mean = prediction_ops["mean"] predicted_covariance = prediction_ops["covariance"] offset = self.output_window_size * gen_math_ops.minimum( @@ -299,20 +357,33 @@ class ARModel(model.TimeSeriesModel): if self.output_window_size < self.input_window_size: new_input_values = array_ops.concat( [input_values[:, self.output_window_size:, :], predicted_mean], 1) + new_input_exogenous_regressors = array_ops.concat( + [input_exogenous_regressors[:, -self.input_window_size:, :], + exogenous_regressors[ + :, offset:offset + self.output_window_size, :]], + axis=1) new_input_times = array_ops.concat([ - input_times[:, self.output_window_size:], + input_times[:, -self.input_window_size:], predict_times[:, offset:offset + self.output_window_size] ], 1) else: new_input_values = predicted_mean[:, -self.input_window_size:, :] + new_input_exogenous_regressors = exogenous_regressors[ + :, + offset - self.input_window_size:offset + self.output_window_size, + :] new_input_times = predict_times[ :, offset - self.input_window_size:offset + self.output_window_size] else: new_input_values = input_values + new_input_exogenous_regressors = exogenous_regressors[ + :, offset:offset + self.output_window_size, :] new_input_times = predict_times[:, offset:offset + self.output_window_size] new_input_times.set_shape(initial_input_times.get_shape()) + new_input_exogenous_regressors.set_shape( + initial_input_exogenous_regressors.get_shape()) new_mean_ta = mean_ta.write(iteration_number, predicted_mean) if isinstance(covariance_ta, tensor_array_ops.TensorArray): new_covariance_ta = covariance_ta.write(iteration_number, @@ -322,6 +393,7 @@ class ARModel(model.TimeSeriesModel): return (iteration_number + 1, new_input_times, new_input_values, + new_input_exogenous_regressors, new_mean_ta, new_covariance_ta) @@ -332,9 +404,13 @@ class ARModel(model.TimeSeriesModel): if self.loss != ARModel.SQUARED_LOSS else 0.) mean_ta_init = tensor_array_ops.TensorArray( dtype=self.dtype, size=prediction_iterations) - _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop( + _, _, _, _, mean_ta, covariance_ta = control_flow_ops.while_loop( _while_condition, _while_body, [ - 0, initial_input_times, initial_input_values, mean_ta_init, + 0, + initial_input_times, + initial_input_values, + initial_input_exogenous_regressors, + mean_ta_init, covariance_ta_init ]) @@ -366,11 +442,11 @@ class ARModel(model.TimeSeriesModel): return {"mean": predicted_mean, "covariance": predicted_covariance} - def _process_window(self, features, mode): + def _process_window(self, features, mode, exogenous_regressors): """Compute model outputs on a single window of data.""" - # TODO(agarwal): Use exogenous features times = math_ops.cast(features[TrainEvalFeatures.TIMES], dtypes.int64) values = math_ops.cast(features[TrainEvalFeatures.VALUES], dtype=self.dtype) + exogenous_regressors = math_ops.cast(exogenous_regressors, dtype=self.dtype) original_values = values # Extra shape checking for the window size (above that in @@ -395,7 +471,8 @@ class ARModel(model.TimeSeriesModel): input_values = values[:, :self.input_window_size, :] else: input_values = None - prediction_ops = self.prediction_ops(times, input_values) + prediction_ops = self.prediction_ops( + times, input_values, exogenous_regressors) prediction = prediction_ops["mean"] covariance = prediction_ops["covariance"] targets = array_ops.slice(values, [0, self.input_window_size, 0], @@ -419,7 +496,8 @@ class ARModel(model.TimeSeriesModel): return model.ModelOutputs( loss=loss, end_state=(times[:, -self.input_window_size:], - values[:, -self.input_window_size:, :]), + values[:, -self.input_window_size:, :], + exogenous_regressors[:, -self.input_window_size:, :]), predictions={"mean": prediction, "covariance": covariance, "observed": original_values[:, -self.output_window_size:]}, prediction_times=times[:, -self.output_window_size:]) @@ -454,17 +532,24 @@ class ARModel(model.TimeSeriesModel): """ features = {feature_name: ops.convert_to_tensor(feature_value) for feature_name, feature_value in features.items()} + times = features[TrainEvalFeatures.TIMES] + exogenous_regressors = self._process_exogenous_features( + times=times, + features={key: value for key, value in features.items() + if key not in [TrainEvalFeatures.TIMES, + TrainEvalFeatures.VALUES, + PredictionFeatures.STATE_TUPLE]}) if mode == estimator_lib.ModeKeys.TRAIN: # For training, we require the window size to be self.window_size as # iterating sequentially on larger windows could introduce a bias. - return self._process_window(features, mode=mode) + return self._process_window( + features, mode=mode, exogenous_regressors=exogenous_regressors) elif mode == estimator_lib.ModeKeys.EVAL: # For evaluation, we allow the user to pass in a larger window, in which # case we try to cover as much of the window as possible without # overlap. Quantitative evaluation is more efficient/correct with fixed # windows matching self.window_size (as with training), but this looping # allows easy plotting of "in-sample" predictions. - times = features[TrainEvalFeatures.TIMES] times.get_shape().assert_has_rank(2) static_window_size = times.get_shape()[1].value if (static_window_size is not None @@ -500,7 +585,9 @@ class ARModel(model.TimeSeriesModel): feature_name: feature_value[:, base_offset:base_offset + self.window_size] for feature_name, feature_value in features.items()}, - mode=mode) + mode=mode, + exogenous_regressors=exogenous_regressors[ + :, base_offset:base_offset + self.window_size]) # This code needs to be updated if new predictions are added in # self._process_window assert len(model_outputs.predictions) == 3 @@ -525,7 +612,9 @@ class ARModel(model.TimeSeriesModel): batch_size = array_ops.shape(times)[0] prediction_shape = [batch_size, self.output_window_size * num_iterations, self.num_features] - previous_state_times, previous_state_values = state + (previous_state_times, + previous_state_values, + previous_state_exogenous_regressors) = state # Make sure returned state always has windows of self.input_window_size, # even if we were passed fewer than self.input_window_size points this # time. @@ -540,14 +629,24 @@ class ARModel(model.TimeSeriesModel): self._scale_data(values)], axis=1)[:, -self.input_window_size:, :] new_state_values.set_shape((None, self.input_window_size, self.num_features)) + new_exogenous_regressors = array_ops.concat( + [previous_state_exogenous_regressors, + exogenous_regressors], axis=1)[:, -self.input_window_size:, :] + new_exogenous_regressors.set_shape( + (None, + self.input_window_size, + self.exogenous_size)) else: # There is no state to keep, and the strided slices above do not handle # input_window_size=0. new_state_times = previous_state_times new_state_values = previous_state_values + new_exogenous_regressors = previous_state_exogenous_regressors return model.ModelOutputs( loss=math_ops.reduce_mean(loss_ta.stack(), axis=0), - end_state=(new_state_times, new_state_values), + end_state=(new_state_times, + new_state_values, + new_exogenous_regressors), predictions={ "mean": array_ops.reshape( array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]), @@ -604,7 +703,8 @@ class AnomalyMixtureARModel(ARModel): num_features, anomaly_distribution=GAUSSIAN_ANOMALY, num_time_buckets=10, - hidden_layer_sizes=None): + hidden_layer_sizes=None, + exogenous_feature_columns=None): assert (anomaly_prior_probability < 1.0 and anomaly_prior_probability > 0.0) self._anomaly_prior_probability = anomaly_prior_probability @@ -619,7 +719,8 @@ class AnomalyMixtureARModel(ARModel): input_window_size=input_window_size, output_window_size=output_window_size, loss=ARModel.NORMAL_LIKELIHOOD_LOSS, - hidden_layer_sizes=hidden_layer_sizes) + hidden_layer_sizes=hidden_layer_sizes, + exogenous_feature_columns=exogenous_feature_columns) def _create_anomaly_ops(self, times, values, prediction_ops_dict): anomaly_log_param = variable_scope.get_variable( @@ -631,9 +732,9 @@ class AnomalyMixtureARModel(ARModel): # distribution. prediction_ops_dict["anomaly_params"] = gen_math_ops.exp(anomaly_log_param) - def prediction_ops(self, times, values): + def prediction_ops(self, times, values, exogenous_regressors): prediction_ops_dict = super(AnomalyMixtureARModel, self).prediction_ops( - times, values) + times, values, exogenous_regressors) self._create_anomaly_ops(times, values, prediction_ops_dict) return prediction_ops_dict diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py index 1e1ca4e77fc41bb418cf2521c2c7fbed9f27c6a8..d078ac8d46397d00efefc32e19aa9bd1133aebaa 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model_test.py @@ -155,12 +155,15 @@ class ARModelTest(test.TestCase): state_times = np.expand_dims(train_data_times[:input_window_size], 0) state_values = np.expand_dims( train_data_values[:input_window_size, :], 0) + state_exogenous = state_times[:, :, None][:, :, :0] def prediction_input_fn(): return ({ PredictionFeatures.TIMES: training.limit_epochs( predict_times, num_epochs=1), - PredictionFeatures.STATE_TUPLE: (state_times, state_values) + PredictionFeatures.STATE_TUPLE: (state_times, + state_values, + state_exogenous) }, {}) (predictions,) = tuple(estimator.predict(input_fn=prediction_input_fn)) predicted_mean = predictions["mean"][:, 0] @@ -246,7 +249,8 @@ class ARModelTest(test.TestCase): with session.Session(): predicted_values = model.predict({ PredictionFeatures.TIMES: [[4, 6, 10]], - PredictionFeatures.STATE_TUPLE: ([[1, 2]], [[[1.], [2.]]]) + PredictionFeatures.STATE_TUPLE: ( + [[1, 2]], [[[1.], [2.]]], [[[], []]]) }) variables.global_variables_initializer().run() self.assertAllEqual(predicted_values["mean"].eval().shape, diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py index 886e1846e2a4f75503a47a3ff92adf97f814053f..f4608ca2d1cc286575f10f6922335f43c7ec0231 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py @@ -190,7 +190,7 @@ class ARRegressor(TimeSeriesRegressor): def __init__( self, periodicities, input_window_size, output_window_size, - num_features, num_time_buckets=10, + num_features, exogenous_feature_columns=None, num_time_buckets=10, loss=ar_model.ARModel.NORMAL_LIKELIHOOD_LOSS, hidden_layer_sizes=None, anomaly_prior_probability=None, anomaly_distribution=None, optimizer=None, model_dir=None, config=None): @@ -205,7 +205,12 @@ class ARRegressor(TimeSeriesRegressor): output_window_size: Number of future time steps to predict. Note that setting it to > 1 empirically seems to give a better fit. num_features: The dimensionality of the time series (one for univariate, - more than one for multivariate). + more than one for multivariate). + exogenous_feature_columns: A list of `tf.feature_column`s (for example + `tf.feature_column.embedding_column`) corresponding to exogenous + features which provide extra information to the model but are not part + of the series to be predicted. Passed to + `tf.feature_column.input_layer`. num_time_buckets: Number of buckets into which to divide (time % periodicity) for generating time based features. loss: Loss function to use for training. Currently supported values are @@ -241,6 +246,7 @@ class ARRegressor(TimeSeriesRegressor): anomaly_distribution = ar_model.AnomalyMixtureARModel.GAUSSIAN_ANOMALY model = ar_model.ARModel( periodicities=periodicities, num_features=num_features, + exogenous_feature_columns=exogenous_feature_columns, num_time_buckets=num_time_buckets, input_window_size=input_window_size, output_window_size=output_window_size, loss=loss, @@ -255,6 +261,7 @@ class ARRegressor(TimeSeriesRegressor): input_window_size=input_window_size, output_window_size=output_window_size, num_features=num_features, + exogenous_feature_columns=exogenous_feature_columns, num_time_buckets=num_time_buckets, hidden_layer_sizes=hidden_layer_sizes, anomaly_prior_probability=anomaly_prior_probability, diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py index 9f161c1695f415ad28c41ad0c00bc0b056399b96..eebee053f8e6000bdf17996abde03896bf4f32e1 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py @@ -29,6 +29,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils from tensorflow.python.client import session from tensorflow.python.estimator import estimator_lib +from tensorflow.python.feature_column import feature_column from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.platform import test @@ -48,12 +49,17 @@ class TimeSeriesRegressorTest(test.TestCase): def _fit_restore_fit_test_template(self, estimator_fn, dtype): """Tests restoring previously fit models.""" model_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - first_estimator = estimator_fn(model_dir) + exogenous_feature_columns = ( + feature_column.numeric_column("exogenous"), + ) + first_estimator = estimator_fn(model_dir, exogenous_feature_columns) times = numpy.arange(20, dtype=numpy.int64) values = numpy.arange(20, dtype=dtype.as_numpy_dtype) + exogenous = numpy.arange(20, dtype=dtype.as_numpy_dtype) features = { feature_keys.TrainEvalFeatures.TIMES: times, - feature_keys.TrainEvalFeatures.VALUES: values + feature_keys.TrainEvalFeatures.VALUES: values, + "exogenous": exogenous } train_input_fn = input_pipeline.RandomWindowInputFn( input_pipeline.NumpyReader(features), shuffle_seed=2, num_threads=1, @@ -68,14 +74,19 @@ class TimeSeriesRegressorTest(test.TestCase): first_loss_after_fit = first_estimator.evaluate( input_fn=eval_input_fn, steps=1)["loss"] self.assertLess(first_loss_after_fit, first_loss_before_fit) - second_estimator = estimator_fn(model_dir) + second_estimator = estimator_fn(model_dir, exogenous_feature_columns) second_estimator.train(input_fn=train_input_fn, steps=2) whole_dataset_input_fn = input_pipeline.WholeDatasetInputFn( input_pipeline.NumpyReader(features)) whole_dataset_evaluation = second_estimator.evaluate( input_fn=whole_dataset_input_fn, steps=1) + exogenous_values_ten_steps = { + "exogenous": numpy.arange( + 10, dtype=dtype.as_numpy_dtype)[None, :, None] + } predict_input_fn = input_pipeline.predict_continuation_input_fn( evaluation=whole_dataset_evaluation, + exogenous_features=exogenous_values_ten_steps, steps=10) # Also tests that limit_epochs in predict_continuation_input_fn prevents # infinite iteration @@ -92,6 +103,7 @@ class TimeSeriesRegressorTest(test.TestCase): saved_prediction = saved_model_utils.predict_continuation( continue_from=whole_dataset_evaluation, steps=10, + exogenous_features=exogenous_values_ten_steps, signatures=signatures, session=sess) # Saved model predictions should be the same as Estimator predictions @@ -104,7 +116,8 @@ class TimeSeriesRegressorTest(test.TestCase): continue_from=whole_dataset_evaluation, features={ feature_keys.FilteringFeatures.TIMES: times[None, -1] + 2, - feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2. + feature_keys.FilteringFeatures.VALUES: values[None, -1] + 2., + "exogenous": values[None, -1, None] + 12. }, signatures=signatures, session=sess) @@ -112,6 +125,10 @@ class TimeSeriesRegressorTest(test.TestCase): second_saved_prediction = saved_model_utils.predict_continuation( continue_from=first_filtering, steps=1, + exogenous_features={ + "exogenous": numpy.arange( + 1, dtype=dtype.as_numpy_dtype)[None, :, None] + }, signatures=signatures, session=sess) self.assertEqual( @@ -122,7 +139,8 @@ class TimeSeriesRegressorTest(test.TestCase): continue_from=first_filtering, features={ feature_keys.FilteringFeatures.TIMES: times[-1] + 3, - feature_keys.FilteringFeatures.VALUES: values[-1] + 3. + feature_keys.FilteringFeatures.VALUES: values[-1] + 3., + "exogenous": values[-1, None] + 13. }, signatures=signatures, session=sess) @@ -131,7 +149,8 @@ class TimeSeriesRegressorTest(test.TestCase): six.assertCountEqual( self, [feature_keys.FilteringFeatures.TIMES, - feature_keys.FilteringFeatures.VALUES], + feature_keys.FilteringFeatures.VALUES, + "exogenous"], signatures.signature_def[ feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys()) batch_numpy_times = numpy.tile( @@ -142,7 +161,8 @@ class TimeSeriesRegressorTest(test.TestCase): session=sess, features={ feature_keys.FilteringFeatures.TIMES: batch_numpy_times, - feature_keys.FilteringFeatures.VALUES: batch_numpy_values + feature_keys.FilteringFeatures.VALUES: batch_numpy_values, + "exogenous": 10. + batch_numpy_values } ) predict_times = numpy.tile( @@ -150,26 +170,32 @@ class TimeSeriesRegressorTest(test.TestCase): predictions = saved_model_utils.predict_continuation( continue_from=state, times=predict_times, + exogenous_features={ + "exogenous": numpy.tile(numpy.arange( + 15, dtype=dtype.as_numpy_dtype), (10,))[None, :, None] + }, signatures=signatures, session=sess) self.assertAllEqual([10, 15, 1], predictions["mean"].shape) def test_fit_restore_fit_ar_regressor(self): - def _estimator_fn(model_dir): + def _estimator_fn(model_dir, exogenous_feature_columns): return estimators.ARRegressor( periodicities=10, input_window_size=10, output_window_size=6, num_features=1, model_dir=model_dir, config=_SeedRunConfig(), # This test is flaky with normal likelihood loss (could add more # training iterations instead). - loss=ar_model.ARModel.SQUARED_LOSS) + loss=ar_model.ARModel.SQUARED_LOSS, + exogenous_feature_columns=exogenous_feature_columns) self._fit_restore_fit_test_template(_estimator_fn, dtype=dtypes.float32) def test_fit_restore_fit_structural_ensemble_regressor(self): dtype = dtypes.float32 - def _estimator_fn(model_dir): + def _estimator_fn(model_dir, exogenous_feature_columns): return estimators.StructuralEnsembleRegressor( num_features=1, periodicities=10, model_dir=model_dir, dtype=dtype, - config=_SeedRunConfig()) + config=_SeedRunConfig(), + exogenous_feature_columns=exogenous_feature_columns) self._fit_restore_fit_test_template(_estimator_fn, dtype=dtype) diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py index d5dce30fda0353bd70f44ec567ac91acce1e9394..5f7e3da2db6da26f50aad9d500959238063a3e3c 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py +++ b/tensorflow/contrib/timeseries/python/timeseries/state_management_test.py @@ -78,7 +78,7 @@ class StubTimeSeriesModel(model.TimeSeriesModel): batch_end_values = array_ops.squeeze( array_ops.slice(values, [0, array_ops.shape(times)[1] - 1, 0], [-1, 1, -1]), - squeeze_dims=[1, 2]) + axis=[1, 2]) # A pretty odd but easy to think about loss: L1 loss on the batch end # values. loss = math_ops.reduce_sum( diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py index 1fcd3e391b63c2362d6187da9556e2c71836dbaa..a614386121e000961bf8b32625a28e1251654320 100644 --- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py +++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py @@ -170,7 +170,7 @@ class KalmanFilter(object): math_ops.matmul( transition_matrices, prior_state[..., None]), - squeeze_dims=[-1]) + axis=[-1]) return advanced_state def predict_state_var( @@ -254,7 +254,7 @@ class KalmanFilter(object): kalman_gain_transposed, array_ops.expand_dims(residual, -1), adjoint_a=True), - squeeze_dims=[-1]) + axis=[-1]) gain_obs = math_ops.matmul( kalman_gain_transposed, observation_model, adjoint_a=True) identity_extradim = linalg_ops.eye( @@ -332,7 +332,7 @@ class KalmanFilter(object): array_ops.expand_dims(state_mean, 1), observation_model, adjoint_b=True), - squeeze_dims=[1]) + axis=[1]) observed_var = math_ops.matmul( math_ops.matmul(observation_model, state_var), observation_model, diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD index 9646d15486ef618f206936ce55a5eb6ca0387e41..eac210418b57eadc2e99ea0c29690236c3a09516 100644 --- a/tensorflow/contrib/tpu/BUILD +++ b/tensorflow/contrib/tpu/BUILD @@ -162,6 +162,7 @@ py_library( "python/tpu/__init__.py", "python/tpu/bfloat16.py", "python/tpu/device_assignment.py", + "python/tpu/keras_support.py", "python/tpu/topology.py", "python/tpu/tpu.py", "python/tpu/tpu_feed.py", diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc index 5900c61a38726551391c212f92b9b9eacd4a465b..b05c76ca64fbaedc205ab06cc31616787ccc84b8 100644 --- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc +++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc @@ -26,6 +26,7 @@ REGISTER_OP("OutfeedEnqueue") .Input("input: dtype") .Attr("dtype: type") .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( An op which emits a single Tensor value from an XLA computation. @@ -36,6 +37,7 @@ REGISTER_OP("OutfeedEnqueueTuple") .Input("inputs: dtypes") .Attr("dtypes: list(type)") .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) .Doc(R"doc( An op which emits multiple Tensor values from an XLA computation. diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py new file mode 100644 index 0000000000000000000000000000000000000000..e86ca0a1d8f15ea77046b4e177e04f494ced55e9 --- /dev/null +++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py @@ -0,0 +1,391 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""*Experimental* support for running Keras models on the TPU. + +To use, wrap your model with the `keras_support.tpu_model` function. + +Example usage: + +``` +# Must activate before building TPU models +keras_support.setup_tpu_session(master_address) + +image = tf.keras.layers.Input(shape=(28, 28, 3), name='image') +c1 = tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3))( image) +flattened = tf.keras.layers.Flatten()(c1) +logits = tf.keras.layers.Dense(10, activation='softmax')(flattened) +model = tf.keras.Model(inputs=[image], outputs=[logits]) +model = keras_support.tpu_model(model) + +# Only TF optimizers are currently supported. +model.compile(optimizer=tf.train.AdamOptimizer(), ...) + +# `images` and `labels` should be Numpy arrays. Support for tensor input +# (e.g. datasets) is planned. +model.fit(images, labels) + +# Invoke before shutting down +keras_support.shutdown_tpu_session() +``` +""" + +# pylint: disable=protected-access + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re + +from tensorflow.contrib.framework.python.framework import experimental +from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session as tf_session +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_spec +from tensorflow.python.keras._impl.keras import backend as K +from tensorflow.python.keras._impl.keras import layers +from tensorflow.python.keras._impl.keras import models +from tensorflow.python.keras._impl.keras import optimizers as keras_optimizers +from tensorflow.python.keras._impl.keras.layers import embeddings +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training import training_util + + +class TPUEmbedding(embeddings.Embedding): + """TPU compatible embedding layer. + + The default Keras layer is not TPU compatible. This layer is a drop-in + replacement: it has the same behavior and will work on CPU and GPU devices. + """ + + def __init__(self, *args, **kw): + super(TPUEmbedding, self).__init__(*args, **kw) + + def build(self, input_shape): + if input_shape[0] is None: + raise ValueError( + 'TPUEmbeddings must have a fixed input_length or input shape.') + return super(TPUEmbedding, self).build(input_shape) + + def call(self, inputs): + if K.dtype(inputs) != 'int32': + inputs = math_ops.cast(inputs, 'int32') + + inputs = array_ops.one_hot(inputs, self.input_dim) + return math_ops.tensordot(inputs, self.embeddings, 1) + + +class CompiledTPUOp( + collections.namedtuple( + 'CompiledTPUOp', + ['tpu_execute_op', 'infeed_tensors', 'infeed_op', 'outfeed_op'])): + pass + + +def _valid_name(tensor_name): + """Return a valid tensor name (strips '/', ':', etc).""" + return re.sub('[^a-zA-Z0-9_-]+', '', tensor_name) + + +class TPUFunction(object): + """K.function compatible interface for invoking a TPU compiled function. + + Recompilation is triggered on-demand for each set of new inputs shapes: the + results are cached for future execution. We expect most computations will + be dominated by a standard batch-size, followed by a straggler batch for + the end of training or evaluation. + + All `inputs` and `outputs` will be loaded via the infeed and outfeed queues + instead of being injected as `feed_dict` items or fetches. + """ + + def __init__(self, model, execution_mode): + self.model = model + self.execution_mode = execution_mode + self._compilation_cache = {} + + def _specialize_model(self, input_specs): + """Specialize `self.model` (a Keras model) for the given input shapes.""" + # Re-create our input and output layers inside our subgraph. They will be + # attached to the true computation when we clone our model in `tpu_fn`. + K.set_learning_phase(self.execution_mode == model_fn_lib.ModeKeys.TRAIN) + + # functools.partial and callable objects are not supported by tpu.rewrite + def _model_fn(): + """Compute fit/eval/predict for the TPU.""" + is_training = self.execution_mode == model_fn_lib.ModeKeys.TRAIN + is_test = self.execution_mode == model_fn_lib.ModeKeys.EVAL + is_predict = self.execution_mode == model_fn_lib.ModeKeys.PREDICT + + # During train/eval, we infeed our features as well as labels. + if is_training or is_test: + infeed_layers = self.model._input_layers + self.model._output_layers + else: + infeed_layers = self.model._input_layers + + # Generate our infeed operation to read features & labels. + infeed_tensors = tpu_ops.infeed_dequeue_tuple( + dtypes=[spec.dtype for spec in input_specs], + shapes=[spec.shape for spec in input_specs], + name='infeed-%s' % self.execution_mode) + + assert len(infeed_tensors) == len(infeed_layers), ( + 'Infeed inputs did not match model: %s vs %s', (infeed_layers, + infeed_tensors)) + + tpu_targets = [] + tpu_inputs = [] + + # Sort infeed outputs into inputs and labels for calling our Keras model. + for tensor, layer in zip(infeed_tensors, infeed_layers): + if layer in self.model._input_layers: + tpu_inputs.append(layers.Input(name=layer.name, tensor=tensor)) + if layer in self.model._output_layers: + tpu_targets.append(tensor) + + optimizer = self.model.optimizer + optimizer.iterations = training_util.get_or_create_global_step() + + # Call our model with our infeed inputs (re-using the weights). + model_outputs = self.model(tpu_inputs) + child_model = models.Model(inputs=tpu_inputs, outputs=model_outputs) + if is_training or is_test: + child_model.compile( + optimizer=self.model.optimizer, + loss=self.model.loss, + loss_weights=self.model.loss_weights, + metrics=self.model.metrics, + weighted_metrics=self.model.weighted_metrics, + target_tensors=tpu_targets, + ) + + # Compute our outfeed depending on the execution mode + if is_training: + child_model._make_train_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.train_function.outputs + ] + return [ + child_model.train_function.updates_op, + tpu_ops.outfeed_enqueue_tuple( + child_model.train_function.outputs, name='oufeed-enqueue-train') + ] + elif is_test: + child_model._make_test_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.test_function.outputs + ] + return [ + tpu_ops.outfeed_enqueue_tuple( + child_model.test_function.outputs, name='outfeed-enqueue-test') + ] + elif is_predict: + child_model._make_predict_function() + self._outfeed_spec = [ + tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name) + for tensor in child_model.predict_function.outputs + ] + return [ + tpu_ops.outfeed_enqueue_tuple( + child_model.predict_function.outputs, + name='outfeed-enqueue-predict', + ) + ] + else: + assert False, 'Unexpected execution mode: %s' % self.execution_mode + + # Capture outfeed metadata computed during the rewrite. + self._outfeed_spec = None + + tpu_execute_op = tpu.rewrite(_model_fn) + + K._initialize_variables(K.get_session()) # pylint-disable: protected-access + + # Generate CPU side operations to enqueue features/labels and dequeue + # outputs from the model call. + with ops.device('/device:TPU:0'): + infeed_tensors = [] + for spec in input_specs: + infeed_tensors.append( + array_ops.placeholder( + dtype=spec.dtype, + shape=spec.shape, + name='infeed-enqueue-%s' % spec.name)) + + infeed_op = tpu_ops.infeed_enqueue_tuple( + infeed_tensors, [spec.shape for spec in input_specs], + name='infeed-enqueue-%s' % self.execution_mode) + + outfeed_op = tpu_ops.outfeed_dequeue_tuple( + dtypes=[spec.dtype for spec in self._outfeed_spec], + shapes=[spec.shape for spec in self._outfeed_spec], + name='outfeed-dequeue-%s' % self.execution_mode) + + return CompiledTPUOp(tpu_execute_op, infeed_tensors, infeed_op, outfeed_op) + + def __call__(self, inputs): + assert isinstance(inputs, list) + + # Strip sample weight from inputs + if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or + self.execution_mode == model_fn_lib.ModeKeys.EVAL): + input_tensors = self.model._feed_inputs + self.model._feed_targets + inputs = inputs[:len(input_tensors)] + else: + input_tensors = self.model._feed_inputs + + # Compute an input specification (used to generate infeed enqueue and + # dequeue operations). We use the shape from our input array and the + # dtype from our model. A user may pass in a float64 for a float32 + # input: for model compatibility we still must generate a float32 infeed. + input_specs = [] + for tensor, ary in zip(input_tensors, inputs): + input_specs.append( + tensor_spec.TensorSpec(ary.shape, tensor.dtype, + _valid_name(tensor.name))) + + # XLA requires every operation in the graph has a fixed shape. To + # handle varying batch sizes we recompile a new sub-graph for each + # unique input shape. + shape_key = tuple([tuple(spec.shape.as_list()) for spec in input_specs]) + + if shape_key not in self._compilation_cache: + logging.info('New input shapes; (re-)compiling: mode=%s, %s', + self.execution_mode, input_specs) + self._compilation_cache[shape_key] = self._specialize_model(input_specs) + + compiled_model = self._compilation_cache[shape_key] + + infeed_dict = {} + for tensor, value in zip(compiled_model.infeed_tensors, inputs): + infeed_dict[tensor] = value + + session = K.get_session() + _, _, outfeed_outputs = session.run([ + compiled_model.infeed_op, compiled_model.tpu_execute_op, + compiled_model.outfeed_op + ], infeed_dict) + + return outfeed_outputs + + +@experimental +def setup_tpu_session(master): + """Initializes and returns a Keras/TF session connected the TPU `master`.""" + session = tf_session.Session( + target=master, config=config_pb2.ConfigProto(isolate_session_state=True)) + K.set_session(session) + K.get_session().run(tpu.initialize_system()) + K.manual_variable_initialization(True) + return session + + +@experimental +def shutdown_tpu_session(session=None): + """Shutdown the TPU attached to session. + + This should be called to cleanly shut down the TPU system before the client + exits. + + Args: + session: Session to shutdown, or None to use the default session. + + Returns: + + """ + if session is None: + session = K.get_session() + + session.run(tpu.shutdown_system()) + + +class KerasTPUModel(models.Model): + """TPU compatible Keras model wrapper.""" + + def __init__(self, inputs, outputs, name=None): + super(models.Model, self).__init__( + inputs=inputs, + outputs=outputs, + name=name, + ) + self.predict_function = None + self.test_function = None + self.train_function = None + + def compile(self, + optimizer, + loss=None, + metrics=None, + loss_weights=None, + sample_weight_mode=None, + weighted_metrics=None, + target_tensors=None, + **kwargs): + if sample_weight_mode: + raise ValueError('sample_weight_mode not supported for TPU execution.') + if weighted_metrics: + raise ValueError('weighted_metrics not supported for TPU execution.') + if target_tensors: + raise ValueError('target_tensors is not supported for TPU execution.') + + super(KerasTPUModel, self).compile(optimizer, loss, metrics, loss_weights, + sample_weight_mode, weighted_metrics, + target_tensors, **kwargs) + + # Keras optimizers are not compatible with TPU rewrite + if not isinstance(self.optimizer, keras_optimizers.TFOptimizer): + raise ValueError( + 'Optimizer must be a TFOptimizer, got: %s' % self.optimizer) + + def train_on_batch(self, x, y, sample_weight=None, class_weight=None): + return super(KerasTPUModel, self).train_on_batch(x, y, sample_weight, + class_weight) + + def _make_train_function(self): + if not self.train_function: + self.train_function = TPUFunction(self, model_fn_lib.ModeKeys.TRAIN) + + return self.train_function + + def _make_test_function(self): + if not self.test_function: + self.test_function = TPUFunction(self, model_fn_lib.ModeKeys.EVAL) + return self.test_function + + def _make_predict_function(self): + if not self.predict_function: + self.predict_function = TPUFunction(self, model_fn_lib.ModeKeys.PREDICT) + return self.predict_function + + def cpu_model(self): + return models.Model( + inputs=self.inputs, + outputs=self.outputs, + name=self.name, + ) + + +@experimental +def tpu_model(model): + return KerasTPUModel( + inputs=model.inputs, outputs=model.outputs, name=model.name) diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py index a1690dadffe5770af9416a7c5ad3a7e336f6bc18..7b8786304ccf7e707712f178838cafb8346d2941 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu.py @@ -173,36 +173,18 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): # gradients, and put the gradient of X in cluster # 'root_cluster.gradient_uid'. # - # When the gradient code adds multiple Ops, it asks them to - # be colocated either with the original Op X, or with one of - # the preceding Ops that was added to the gradient. In other - # words, we want to detect the case where we are colocating - # with an Op that is in cluster root_cluster.gradient_uid - # and put the new Op in that same cluster if the - # gradient_uid is the same (the case that we are in the same - # invocation of gradients, and just adding new Ops to the - # cluster); and in a different cluster if the gradient_uids - # are different (the case that we are in a new invocation of - # gradients, taking the gradient of a previously-computed - # gradient). + # When taking a gradient of a gradient, some ops will be + # colocated with Op in the forward pass (e.g., cluster + # root_cluster) and some in the backward pass (e.g., cluster + # root_cluster.initial_gradient_uid). We need all of the + # grad-of-grad ops to be in the same cluster to avoid cyclic + # dependencies between clusters. We adopt a heuristic that + # puts any op clustered with root_cluster. in + # root_cluster.gradient_uid, even if xxx was + # initial_gradient_uid. self._in_gradient_colocation = op parts = outside_attr.split(".") - if len(parts) > 1: - uid = parts[-1] - if uid == gradient_uid: - # Keep using the same cluster - cluster = outside_attr - else: - # We're taking the gradient of a gradient so make a new - # cluster attr, adding a new '.uid' on the end to - # preserve the invariant that the gradient_uid is the - # suffix after the last '.' in the attr. - cluster = outside_attr + "." + gradient_uid - else: - # We're taking the gradient of an Op in the forward pass, so - # make a new cluster combining the Op's cluster and the - # gradient id. - cluster = outside_attr + "." + gradient_uid + cluster = parts[0] + "." + gradient_uid self._EnterOutsideCompilationScope(cluster=cluster) except ValueError: # The attr was not present: do nothing. diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py index cc1a7fd801506e3f0b758c4848205f1c375403d2..6d7331e3c79ade9c12c15de79f550cf3973c4e6c 100644 --- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py +++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py @@ -210,8 +210,9 @@ class RunConfig(run_config_lib.RunConfig): raise ValueError( 'You cannot provide a ClusterResolver and ' 'session_config.cluster_def.') - self._session_config.cluster_def.CopyFrom( - self._cluster_spec.as_cluster_def()) + if self._cluster_spec: + self._session_config.cluster_def.CopyFrom( + self._cluster_spec.as_cluster_def()) @property def evaluation_master(self): diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 4004db74a8a1a7597dfec3aa609b7cfd00b28bd3..acca47e9a3569cdac692349c142e7c51fa98dcae 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -145,6 +145,7 @@ load( "if_static", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library") load( "//third_party/mkl:build_defs.bzl", "if_mkl", @@ -161,7 +162,7 @@ exports_files(["ops/ops.pbtxt"]) # Note that some protos are in neither additional_core_proto_srcs nor this # filegroup; e.g. ones with individual proto_library targets. # LINT.IfChange -CORE_PROTO_SRCS = [ +COMMON_PROTO_SRCS = [ "example/example.proto", "example/feature.proto", "framework/allocation_description.proto", @@ -189,7 +190,6 @@ CORE_PROTO_SRCS = [ "framework/types.proto", "framework/variable.proto", "framework/versions.proto", - "lib/core/error_codes.proto", "protobuf/config.proto", "protobuf/cluster.proto", "protobuf/debug.proto", @@ -202,8 +202,14 @@ CORE_PROTO_SRCS = [ "util/memmapped_file_system.proto", "util/saved_tensor_slice.proto", ] + +ERROR_CODES_PROTO_SRCS = [ + "lib/core/error_codes.proto", +] # LINT.ThenChange(//tensorflow/core/android_proto_config.asciipb) +CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS + # Protos which are not needed on mobile builds, but should be included in # protos_all. # @@ -224,12 +230,16 @@ ADDITIONAL_CORE_PROTO_SRCS = [ tf_proto_library( name = "protos_all", - srcs = CORE_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, + srcs = [], cc_api_version = 2, default_header = True, j2objc_api_version = 1, java_api_version = 2, js_api_version = 2, + protodeps = [ + ":protos_all_proto", + ":error_codes_proto", + ], visibility = ["//visibility:public"], ) @@ -257,12 +267,6 @@ proto_library( visibility = ["//visibility:public"], ) -closure_proto_library( - name = "example_protos_closure", - visibility = ["//visibility:public"], - deps = [":example_protos"], -) - exports_files([ "framework/types.proto", ]) @@ -287,7 +291,7 @@ PLATFORM_BASE_HDRS = [ "platform/logging.h", "platform/macros.h", "platform/types.h", - "platform/cpu_info.h", + "platform/byte_order.h", ] PLATFORM_OTHER_HDRS = [ @@ -295,6 +299,7 @@ PLATFORM_OTHER_HDRS = [ "platform/stacktrace.h", "platform/stacktrace_handler.h", "platform/context.h", + "platform/cpu_info.h", "platform/cpu_feature_guard.h", "platform/dynamic_annotations.h", "platform/env.h", @@ -323,7 +328,6 @@ cc_library( srcs = glob([ "platform/*/integral_types.h", "platform/*/logging.h", - "platform/*/cpu_info.h", ]), hdrs = PLATFORM_BASE_HDRS, deps = [ @@ -353,7 +357,9 @@ cc_library( "lib/bfloat16/bfloat16.h", ] + tf_additional_proto_hdrs() + glob(tf_env_time_hdrs()), copts = tf_copts(), - deps = tf_lib_proto_parsing_deps(), + deps = tf_lib_proto_parsing_deps() + [ + "@double_conversion//:double-conversion", + ], ) # This build rule (along with :lib_internal, :framework, and @@ -540,6 +546,7 @@ tf_cuda_library( "framework/device_base.h", "framework/function.h", "framework/graph_def_util.h", + "framework/graph_to_functiondef.h", "framework/kernel_def_builder.h", "framework/log_memory.h", "framework/lookup_interface.h", @@ -672,6 +679,7 @@ cc_library( "framework/tensor_types.h", "framework/type_traits.h", "lib/bfloat16/bfloat16.h", + "platform/byte_order.h", "platform/default/dynamic_annotations.h", "platform/default/integral_types.h", "platform/default/logging.h", @@ -993,6 +1001,7 @@ cc_library( "//tensorflow/core/kernels:nn", "//tensorflow/core/kernels:parameterized_truncated_normal_op", "//tensorflow/core/kernels:parsing", + "//tensorflow/core/kernels:partitioned_function_ops", "//tensorflow/core/kernels:random_ops", "//tensorflow/core/kernels:random_poisson_op", "//tensorflow/core/kernels:remote_fused_graph_ops", @@ -1137,7 +1146,8 @@ filegroup( filegroup( name = "mobile_srcs_no_runtime", srcs = [ - ":proto_text_srcs_all", + ":protos_all_proto_text_srcs", + ":error_codes_proto_text_srcs", "//tensorflow/core/platform/default/build_config:android_srcs", ] + glob( [ @@ -1247,6 +1257,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1286,6 +1297,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1349,6 +1361,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1371,6 +1384,7 @@ cc_library( deps = [ ":protos_all_cc_impl", "//third_party/eigen3", + "@double_conversion//:double-conversion", "@nsync//:nsync_cpp", "@protobuf_archive//:protobuf", ], @@ -1618,6 +1632,18 @@ tf_proto_library_cc( ], ) +tf_proto_library_cc( + name = "eager_service_proto", + srcs = ["protobuf/eager_service.proto"], + has_services = 1, + cc_api_version = 2, + cc_stubby_versions = ["2"], + protodeps = tf_additional_all_protos(), + visibility = [ + "//tensorflow:internal", + ], +) + LIB_INTERNAL_PRIVATE_HEADERS = ["framework/resource_handle.h"] + glob( [ "lib/**/*.h", @@ -1767,6 +1793,7 @@ cc_library( "//tensorflow/core/platform/default/build_config:platformlib", "@snappy", "@zlib_archive//:zlib", + "@double_conversion//:double-conversion", "@protobuf_archive//:protobuf", ] + tf_protos_all_impl() + tf_protos_grappler_impl(), ) @@ -1912,6 +1939,7 @@ cc_library( "lib/core/casts.h", "lib/core/stringpiece.h", "lib/png/png_io.h", + "platform/byte_order.h", "platform/cpu_info.h", "platform/default/integral_types.h", "platform/default/logging.h", @@ -1927,15 +1955,58 @@ cc_library( ], ) -proto_text_hdrs_and_srcs = tf_generate_proto_text_sources( - name = "proto_text_srcs_all", - srcs = CORE_PROTO_SRCS, +tf_proto_library( + name = "error_codes_proto", + srcs = ERROR_CODES_PROTO_SRCS, + cc_api_version = 2, + default_header = True, + j2objc_api_version = 1, + java_api_version = 2, + js_api_version = 2, +) + +tf_generate_proto_text_sources( + name = "error_codes_proto_text", + srcs = ERROR_CODES_PROTO_SRCS, + protodeps = [], + srcs_relative_dir = "tensorflow/core/", + deps = [ + ":error_codes_proto_cc", + ":lib_internal", + ], +) + +tf_proto_library( + name = "protos_all_proto", + srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS, + cc_api_version = 2, + default_header = True, + j2objc_api_version = 1, + java_api_version = 2, + js_api_version = 2, + protodeps = [ + ":error_codes_proto", + ], +) + +tf_generate_proto_text_sources( + name = "protos_all_proto_text", + srcs = COMMON_PROTO_SRCS, + protodeps = ERROR_CODES_PROTO_SRCS, srcs_relative_dir = "tensorflow/core/", + deps = [ + ":error_codes_proto_text", + ":lib_internal", + ":protos_all_proto_cc", + ], ) cc_library( name = "proto_text", - hdrs = proto_text_hdrs_and_srcs.hdrs, + hdrs = [ + ":error_codes_proto_text_hdrs", + ":protos_all_proto_text_hdrs", + ], deps = [ ":lib", ":lib_internal", @@ -2080,7 +2151,7 @@ tf_cuda_library( "util/memmapped_file_system.cc", "util/memmapped_file_system_writer.cc", ], - }) + proto_text_hdrs_and_srcs.srcs + tf_additional_framework_srcs(), + }) + tf_additional_framework_srcs(), hdrs = FRAMEWORK_INTERNAL_PUBLIC_HEADERS, copts = tf_copts(), linkopts = select({ @@ -2094,7 +2165,8 @@ tf_cuda_library( deps = [ ":lib", ":lib_internal", - ":proto_text", + ":protos_all_proto_text", + ":error_codes_proto_text", ":protos_all_cc", ":version_lib", "//tensorflow/core/platform/default/build_config:platformlib", @@ -2506,6 +2578,19 @@ tf_cuda_library( cc_library( name = "gpu_id", + hdrs = [ + "common_runtime/gpu/gpu_id.h", + "common_runtime/gpu/gpu_id_manager.h", + ], + deps = [ + ":lib", + ] + if_static([ + ":gpu_id_impl", + ]), +) + +cc_library( + name = "gpu_id_impl", srcs = ["common_runtime/gpu/gpu_id_manager.cc"], hdrs = [ "common_runtime/gpu/gpu_id.h", @@ -2555,7 +2640,7 @@ tf_cuda_library( ":core_cpu_lib", ":framework", ":framework_internal", - ":gpu_id", + ":gpu_id_impl", ":gpu_init_impl", ":gpu_lib", ":graph", @@ -2991,6 +3076,7 @@ tf_cc_tests( "framework/common_shape_fns_test.cc", "framework/function_test.cc", "framework/graph_def_util_test.cc", + "framework/graph_to_functiondef_test.cc", "framework/kernel_def_builder_test.cc", "framework/memory_types_test.cc", "framework/node_def_builder_test.cc", @@ -3069,6 +3155,8 @@ tf_cc_tests( ":testlib", "//tensorflow/cc:cc_ops", "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/cc:sendrecv_ops", "//tensorflow/cc:while_loop", @@ -4073,3 +4161,9 @@ alias( actual = ":mobile_srcs", visibility = ["//visibility:public"], ) + +closure_proto_library( + name = "example_protos_closure", + visibility = ["//visibility:public"], + deps = [":example_protos"], +) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt index c2858a1bfbb5a30e5e72bcfad694f696e08d2346..b90f5473c89cbe3afe38f0283025e7273817d0e4 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt @@ -82,9 +82,9 @@ END } summary: "Update \'*var\' according to the Adam algorithm." description: <